SOURCE CODE biopipen.ns.gene DOCS

"""Gene related processes"""

from ..core.proc import Proc
from ..core.config import config


class GeneNameConversion(Proc):DOCS
    """Convert gene names back and forth using MyGeneInfo

    Input:
        infile: The input file with original gene names
            It should be a tab-separated file with header

    Output:
        outfile: The output file with converted gene names

    Envs:
        notfound (choice): What to do if a conversion cannot be done.
            - use-query: Ignore the conversion and use the original name
            - skip: Ignore the conversion and skip the entire row in input file
            - ignore: Same as skip
            - error: Report error
            - na: Use NA
        dup (choice): What to do if a conversion results in multiple names.
            - first: Use the first name, sorted by matching score descendingly (default)
            - last: Use the last name, sorted by matching score descendingly
            - combine: Combine all names using `;` as separator
        genecol: The index (1-based) or name of the column where genes are present
        output (choice): How to output.
            - append: Add the converted names as new columns at the end using `envs.outfmt`
                as the column name.
            - replace: Drop the original name column, and insert
                the converted names at the original position.
            - converted: Only keep the converted names.
            - with-query: Output 2 columns with original and converted names.
        infmt: What's the original gene name format
            Available fields
            https://docs.mygene.info/en/latest/doc/query_service.html#available-fields
        outfmt: What's the target gene name format. Currently only a single format
            is supported.
        species: Limit gene query to certain species.
            Supported: human, mouse, rat, fruitfly, nematode, zebrafish,
            thale-cress, frog and pig
    """  # noqa: E501
    input = "infile:file"
    output = "outfile:file:{{in.infile | basename}}"
    lang = config.lang.rscript
    envs = {
        "notfound": "error",
        "genecol": 1,
        "dup": "first",
        "output": "append",
        "infmt": ["symbol", "alias"],
        "outfmt": "symbol",
        "species": "human",
    }
    script = "file://../scripts/gene/GeneNameConversion.R"


class GenePromoters(Proc):DOCS
    """Get gene promoter regions by specifying the flanking regions of TSS

    Input:
        infile: The input file with gene ids/names

    Output:
        outfile: The output file with promoter regions in BED format

    Envs:
        up (type=int): The upstream distance from TSS
        down (type=int): The downstream distance from TSS
            If not specified, the default is `envs.up`
        notfound (choice): What to do if a gene is not found.
            - skip: Skip the gene
            - error: Report error
        refgene: The reference gene annotation file in GTF format
        header (flag): Whether the input file has a header
        genecol (type=int): The index (1-based) of the gene column
        match_id (flag): Should we match the genes in `in.infile` by `gene_id`
            instead of `gene_name` in `envs.refgene`
        sort (flag): Sort the output by chromosome and start position
        chrsize: The chromosome size file, from which the chromosome order is
            used to sort the output
    """
    input = "infile:file"
    output = "outfile:file:{{in.infile | stem}}-promoters.bed"
    lang = config.lang.rscript
    envs = {
        "up": 2000,
        "down": None,
        "notfound": "error",
        "refgene": config.ref.refgene,
        "header": True,
        "genecol": 1,
        "match_id": False,
        "sort": False,
        "chrsize": config.ref.chrsize,
    }
    script = "file://../scripts/gene/GenePromoters.R"