# Source code: biopipen.ns.cnvkit

"""CNVkit commands"""

from ..core.proc import Proc
from ..core.config import config


class CNVkitAccess(Proc):
    """Calculate the sequence-accessible coordinates in chromosomes from the
    given reference genome using `cnvkit.py access`

    Input:
        excfiles: Additional regions to exclude, in BED format

    Output:
        outfile: The output file

    Envs:
        cnvkit: Path to `cnvkit.py`
        min_gap_size (type=int): Minimum gap size between accessible sequence
            regions
        ref: The reference genome fasta file

    Requires:
        cnvkit:
            - check: {{proc.envs.cnvkit}} version
    """
    input = "excfiles:files"
    # The output name encodes the reference stem and the gap size in use.
    output = (
        "outfile:file:{{envs.ref | stem0}}.access.{{envs.min_gap_size}}.bed"
    )
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        "min_gap_size": 5000,
        "ref": config.ref.reffa,
    }
    script = "file://../scripts/cnvkit/CNVkitAccess.py"


class CNVkitAutobin(Proc):
    """Quickly estimate read counts or depths in a BAM file to estimate
    reasonable on- and (if relevant) off-target bin sizes.

    Using `cnvkit.py autobin`.

    If multiple BAMs are given, use the BAM with median file size.

    Input:
        bamfiles: The bamfiles
        accfile: The access file
        baitfile: Potentially targeted genomic regions.
            E.g. all possible exons for the reference genome.
            Format - BED, interval list, etc.

    Output:
        target_file: The target BED output
        antitarget_file: The antitarget BED output

    Envs:
        cnvkit: Path to `cnvkit.py`
        method (choice): Sequencing protocol. Determines whether and how to use
            antitarget bins.
            - hybrid: Hybridization capture
            - amplicon: Targeted amplicon sequencing
            - wgs: Whole genome sequencing
        bp_per_bin (type=int): Desired average number of sequencing read bases
            mapped to each bin.
        target_max_size (type=int): Maximum size of target bins.
        target_min_size (type=int): Minimum size of target bins.
        antitarget_max_size (type=int): Maximum size of antitarget bins.
        antitarget_min_size (type=int): Minimum size of antitarget bins.
        annotate: Use gene models from this file to assign names to the target
            regions. Format: UCSC refFlat.txt or ensFlat.txt file (preferred),
            or BED, interval list, GFF, or similar.
        short_names (flag): Reduce multi-accession bait labels to
            be short and consistent.
        ref: The reference genome fasta file

    Requires:
        cnvkit:
            - check: {{proc.envs.cnvkit}} version
    """
    input = "bamfiles:files, accfile:file, baitfile:file"
    # Both outputs are named after the first BAM file with an `-etc` suffix.
    output = [
        "target_file:file:{{in.bamfiles | first | stem0}}-etc.target.bed",
        "antitarget_file:file:{{in.bamfiles | first | stem0}}"
        "-etc.antitarget.bed",
    ]
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        "method": "hybrid",
        "bp_per_bin": 100000,
        "target_max_size": 20000,
        "target_min_size": 20,
        "antitarget_max_size": 500000,
        "antitarget_min_size": 500,
        "annotate": None,
        "short_names": False,
        "ref": config.ref.reffa,
    }
    script = "file://../scripts/cnvkit/CNVkitAutobin.py"


class CNVkitCoverage(Proc):
    """Run cnvkit coverage

    Input:
        bamfile: The bamfile
        target_file: The target file or anti-target file

    Output:
        outfile: The output coverage file

    Envs:
        cnvkit: Path to cnvkit.py
        count (flag): Get read depths by counting read midpoints
            within each bin. (An alternative algorithm).
        min_mapq (type=int): Minimum mapping quality to include a read.
        ncores (type=int): Number of subprocesses to calculate coverage
            in parallel
        ref: The reference genome fasta file

    Requires:
        cnvkit:
            - check: {{proc.envs.cnvkit}} version
    """
    input = "bamfile:file, target_file:file"
    # The suffix is chosen from the target file's basename: anything containing
    # "antitarget" yields `.antitargetcoverage.cnn`, otherwise
    # `.targetcoverage.cnn`.
    output = """outfile:file:
        {%- if "antitarget" in basename(in.target_file) -%}
            {{in.bamfile | stem0}}.antitargetcoverage.cnn
        {%- else -%}
            {{in.bamfile | stem0}}.targetcoverage.cnn
        {%- endif -%}
    """
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        "count": False,
        "min_mapq": 0,
        "ncores": config.misc.ncores,
        "ref": config.ref.reffa,
    }
    script = "file://../scripts/cnvkit/CNVkitCoverage.py"


class CNVkitReference(Proc):
    """Run cnvkit reference

    To generate a reference file from normal samples, provide the cnn coverage
    files from the normal samples. To generate a flat reference file, provide
    the target/antitarget file.

    Input:
        covfiles: The coverage files from normal samples
        target_file: Target intervals (.bed or .list)
        antitarget_file: Antitarget intervals (.bed or .list)
        sample_sex: Specify the chromosomal sex of all given samples as male or
            female. Guess each sample from coverage of X and Y chromosomes if
            not given.

    Output:
        outfile: The reference cnn file

    Envs:
        cnvkit: Path to cnvkit.py
        cluster (flag): Calculate and store summary stats for
            clustered subsets of the normal samples with similar coverage
            profiles.
        min_cluster_size (type=int): Minimum cluster size to keep in reference
            profiles.
        male_reference (flag): Create a male reference: shift
            female samples chrX log-coverage by -1, so the reference chrX
            average is -1. Otherwise, shift male samples chrX by +1, so the
            reference chrX average is 0.
        no_gc (flag): Skip GC correction.
        no_edge (flag): Skip edge-effect correction.
        no_rmask (flag): Skip RepeatMasker correction.
        ref: The reference genome fasta file

    Requires:
        cnvkit:
            - check: {{proc.envs.cnvkit}} version
    """
    input = [
        "covfiles:files",
        "target_file:file",
        "antitarget_file:file",
        "sample_sex:var",
    ]
    # Without coverage files the output is `flat.reference.cnn`; otherwise it
    # is named after the first coverage file.
    output = """outfile:file:
        {%- if not in.covfiles -%}
            flat.reference.cnn
        {%- else -%}
            {{in.covfiles | first | stem0 }}-etc.reference.cnn
        {%- endif -%}
    """
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        "cluster": False,
        "min_cluster_size": 4,
        "male_reference": False,
        "no_gc": False,
        "no_edge": False,
        "no_rmask": False,
        "ref": config.ref.reffa,
    }
    script = "file://../scripts/cnvkit/CNVkitReference.py"


class CNVkitFix(Proc):
    """Run cnvkit.py fix

    Input:
        target_file: The target file
        antitarget_file: The antitarget file
        reference: The reference cnn file
        sample_id: Sample ID for target/antitarget files.
            Otherwise inferred from file names.

    Output:
        outfile: The fixed coverage files (.cnr)

    Envs:
        cnvkit: Path to cnvkit.py
        cluster (flag): Compare and use cluster-specific values
            present in the reference profile.
            (requires `envs.cluster=True` for `CNVkitReference`).
        no_gc (flag): Skip GC correction.
        no_edge (flag): Skip edge-effect correction.
        no_rmask (flag): Skip RepeatMasker correction.

    Requires:
        cnvkit:
            - check: {{proc.envs.cnvkit}} version
    """
    input = (
        "target_file:file, antitarget_file:file, reference:file, sample_id:var"
    )
    # Falls back to the stem of the target file when no sample ID is given.
    output = (
        "outfile:file:{{in.sample_id | default: stem0(in.target_file)}}.cnr"
    )
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        "cluster": False,
        "no_gc": False,
        "no_edge": False,
        "no_rmask": False,
    }
    script = "file://../scripts/cnvkit/CNVkitFix.py"


class CNVkitSegment(Proc):
    """Run cnvkit.py segment

    For segmentation methods, see
    https://cnvkit.readthedocs.io/en/stable/pipeline.html#segmentation-methods

    Input:
        cnrfile: The fixed coverage files (.cnr)
        vcf: VCF file name containing variants for segmentation
            by allele frequencies (optional).
        sample_id: Specify the name of the sample in the VCF to use for b-allele
            frequency extraction and as the default plot title.
        normal_id: Corresponding normal sample ID in the input VCF.
            This sample is used to select only germline SNVs to plot
            b-allele frequencies.

    Output:
        outfile: The segmentation file (.cns)

    Envs:
        cnvkit: Path to cnvkit.py
        method: Method to use for segmentation.
            Candidates - cbs, flasso, haar, none, hmm, hmm-tumor, hmm-germline
        threshold: Significance threshold (p-value or FDR, depending on method)
            to accept breakpoints during segmentation. For HMM methods,
            this is the smoothing window size.
        drop_low_coverage (flag): Drop very-low-coverage bins
            before segmentation to avoid false-positive deletions in
            poor-quality tumor samples.
        drop_outliers (type=int): Drop outlier bins more than this many
            multiples of the 95th quantile away from the average within a
            rolling window. Set to 0 for no outlier filtering.
        rscript: Path to Rscript
        ncores (type=int): Number of subprocesses to segment in parallel.
            0 or negative for all available cores
        smooth_cbs (flag): Perform an additional smoothing before
            CBS segmentation, which in some cases may increase the sensitivity.
            Used only for CBS method.
        min_variant_depth (type=int): Minimum read depth for a SNV to be
            displayed in the b-allele frequency plot.
        zygosity_freq (type=float): Ignore VCF's genotypes (GT field) and
            instead infer zygosity from allele frequencies.

    Requires:
        cnvkit:
            - check: {{proc.envs.cnvkit}} version
        r-DNAcopy:
            - check: {{proc.envs.rscript}} <(echo "library(DNAcopy)")
    """
    input = "cnrfile:file, vcf:file, sample_id:var, normal_id:var"
    # The segmentation file keeps the stem of the input .cnr file.
    output = "outfile:file:{{in.cnrfile | stem0}}.cns"
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        "method": "cbs",
        "threshold": None,
        "drop_low_coverage": False,
        "drop_outliers": 10,
        "rscript": config.lang.rscript,
        "ncores": config.misc.ncores,
        "smooth_cbs": False,
        "min_variant_depth": 20,
        "zygosity_freq": 0.25,
    }
    script = "file://../scripts/cnvkit/CNVkitSegment.py"


class CNVkitScatter(Proc):
    """Run cnvkit.py scatter

    Input:
        cnrfile: The fixed cnr file (.cnr)
        cnsfile: The segmentation file (.cns)
        vcf: VCF file name containing variants for segmentation
            by allele frequencies (optional).
        sample_id: Specify the name of the sample in the VCF to use for b-allele
            frequency extraction and as the default plot title.
        normal_id: Corresponding normal sample ID in the input VCF.
            This sample is used to select only germline SNVs to plot
            b-allele frequencies.

    Output:
        outdir: Output directory with plots for multiple cases

    Envs:
        cnvkit: Path to cnvkit.py
        convert: Path to `convert` to convert pdf to png file
        convert_args (ns): The arguments for `convert`
            - density (type=int): Horizontal and vertical density of the image
            - quality (type=int): JPEG/MIFF/PNG compression level
            - background: Background color
            - alpha: Activate, deactivate, reset, or set the alpha channel
            - <more>: See `convert -help` and also:
                https://linux.die.net/man/1/convert
        chromosome: Chromosome or chromosomal range,
            e.g. 'chr1' or 'chr1:2333000-2444000', to display.
            If a range is given, all targeted genes in this range will be
            shown, unless -g/--gene is also given.
        gene: Name of gene or genes (comma-separated) to display.
        width (type=int): Width of margin to show around the selected gene(s)
            (-g/--gene) or small chromosomal region (-c/--chromosome).
        antitarget_marker (flag): Plot antitargets using this
            symbol when plotting in a selected chromosomal region
            (-g/--gene or -c/--chromosome).
        by_bin (flag): Plot data x-coordinates by bin indices
            instead of genomic coordinates. All bins will be shown with equal
            width, no blank regions will be shown, and x-axis values indicate
            bin number (within chromosome) instead of genomic position.
        segment_color: Plot segment lines in this color. Value can be
            any string accepted by matplotlib, e.g. 'red' or '#CC0000'.
        trend (flag): Draw a smoothed local trendline on the
            scatter plot.
        y_max (type=int): y-axis upper limit.
        y_min (type=int): y-axis lower limit.
        min_variant_depth (type=int): Minimum read depth for a SNV to be
            displayed in the b-allele frequency plot.
        zygosity_freq (type=float): Ignore VCF's genotypes (GT field) and
            instead infer zygosity from allele frequencies.
        title: Plot title. Sample ID if not provided.
        cases (type=json): The cases for different plots with keys as case names
            and values to overwrite the default args given by `envs.<args>`,
            including `convert_args`, `by_bin`, `chromosome`, `gene`, `width`,
            `antitarget_marker`, `segment_color`, `trend`, `y_max`, `y_min`,
            `min_variant_depth`, `zygosity_freq` and `title`.
            By default, an `all` case will be created with default arguments
            if no case specified

    Requires:
        cnvkit:
            - check: {{proc.envs.cnvkit}} version
        convert:
            - check: {{proc.envs.convert}} -version
    """
    # NOTE(review): `config:var` is declared as an input but is not described
    # in the Input section above; confirm whether the script still reads it
    # before documenting or removing it.
    input = (
        "cnrfile:file, cnsfile:file, config:var, "
        "vcf:file, sample_id:var, normal_id:var"
    )
    output = "outdir:dir:{{in.cnrfile | stem0}}.scatter"
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        "convert": config.exe.convert,
        "convert_args": {
            "density": 150,
            "quality": 90,
            "background": "white",
            "alpha": "remove",
        },
        "chromosome": None,
        "gene": None,
        "width": 1000000,
        "antitarget_marker": False,
        "by_bin": False,
        "segment_color": None,
        "trend": False,
        "y_max": None,
        "y_min": None,
        "min_variant_depth": 20,
        "zygosity_freq": 0.25,
        "title": None,
        "cases": {},
    }
    script = "file://../scripts/cnvkit/CNVkitScatter.py"
    plugin_opts = {
        "report": "file://../reports/cnvkit/CNVkitScatter.svelte",
        "report_paging": 10,
    }


class CNVkitDiagram(Proc):
    """Run cnvkit.py diagram

    Input:
        cnrfile: The fixed cnr file (.cnr)
        cnsfile: The segmentation file (.cns)
        sample_sex: Specify the sample's chromosomal sex as male or female.
            (Otherwise guessed from X and Y coverage).

    Output:
        outdir: Output directory with the diagram plots

    Envs:
        cnvkit: Path to cnvkit.py
        convert: Path to `convert` to convert pdf to png file
        convert_args (ns): The arguments for `convert`
            - density (type=int): Horizontal and vertical density of the image
            - quality (type=int): JPEG/MIFF/PNG compression level
            - background: Background color
            - alpha: Activate, deactivate, reset, or set the alpha channel
            - <more>: See `convert -help` and also:
                https://linux.die.net/man/1/convert
        threshold (type=float): Copy number change threshold to label genes.
        min_probes (type=int): Minimum number of covered probes to label a gene.
        male_reference (flag): Assume inputs were normalized to a
            male reference (i.e. female samples will have +1 log-CNR of chrX;
            otherwise male samples would have -1 chrX).
        no_shift_xy (flag): Don't adjust the X and Y chromosomes
            according to sample sex.
        title: Plot title. Sample ID if not provided.
        cases (type=json): The cases with keys as names and values as different
            configs, including `threshold`, `min_probes`, `male_reference`,
            `no_shift_xy` and `title`

    Requires:
        cnvkit:
            - check: {{proc.envs.cnvkit}} version
        convert:
            - check: {{proc.envs.convert}} -version
    """
    input = "cnrfile:file, cnsfile:file, sample_sex:var"
    output = "outdir:dir:{{in.cnrfile | stem0}}.diagram"
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        "convert": config.exe.convert,
        "convert_args": {
            "density": 150,
            "quality": 90,
            "background": "white",
            "alpha": "remove",
        },
        "threshold": 0.5,
        "min_probes": 3,
        "male_reference": False,
        "no_shift_xy": False,
        "title": None,
        "cases": {},
    }
    script = "file://../scripts/cnvkit/CNVkitDiagram.py"
    # NOTE(review): this process reuses the CNVkitScatter report template.
    # Confirm that is intentional rather than a copy-paste leftover before
    # pointing it at a dedicated diagram template.
    plugin_opts = {
        "report": "file://../reports/cnvkit/CNVkitScatter.svelte",
        "report_paging": 10,
    }


class CNVkitHeatmap(Proc):
    """Run cnvkit.py heatmap for multiple cases

    Input:
        segfiles: Sample coverages as raw probes (.cnr) or segments (.cns).
        sample_sex: Specify the chromosomal sex of all given samples as male
            or female. Separated by comma. (Default: guess each sample from
            coverage of X and Y chromosomes).

    Output:
        outdir: Output directory with heatmaps of multiple cases

    Envs:
        cnvkit: Path to cnvkit.py
        convert: Path to `convert` to convert pdf to png file
        convert_args (ns): The arguments for `convert`
            - density (type=int): Horizontal and vertical density of the image
            - quality (type=int): JPEG/MIFF/PNG compression level
            - background: Background color
            - alpha: Activate, deactivate, reset, or set the alpha channel
            - <more>: See `convert -help` and also:
                https://linux.die.net/man/1/convert
        by_bin (flag): Plot data x-coordinates by bin indices
            instead of genomic coordinates. All bins will be shown with equal
            width, no blank regions will be shown, and x-axis values indicate
            bin number (within chromosome) instead of genomic position.
        chromosome: Chromosome (e.g. 'chr1') or chromosomal range
            (e.g. 'chr1:2333000-2444000') to display.
        desaturate (flag): Tweak color saturation to focus on
            significant changes.
        male_reference (flag): Assume inputs were normalized to
            a male reference. (i.e. female samples will have +1 log-CNR of chrX;
            otherwise male samples would have -1 chrX).
        no_shift_xy (flag): Don't adjust the X and Y chromosomes
            according to sample sex.
        order: A file with sample names in the desired order.
        cases (type=json): The cases for different plots with keys as case names
            and values to overwrite the default args given by `envs.<args>`,
            including `convert_args`, `by_bin`, `chromosome`, `desaturate`,
            `male_reference`, and `no_shift_xy`.
            By default, an `all` case will be created with default arguments
            if no case specified

    Requires:
        cnvkit:
            - check: {{proc.envs.cnvkit}} version
        convert:
            - check: {{proc.envs.convert}} -version
    """
    input = "segfiles:files, sample_sex:var"
    output = "outdir:dir:{{in.segfiles | first | stem0}}-etc.heatmap"
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        # Path to ImageMagick's `convert`, read from the same config entry as
        # the other plotting processes in this module.
        "convert": config.exe.convert,
        "convert_args": {
            "density": 150,
            "quality": 90,
            "background": "white",
            "alpha": "remove",
        },
        "by_bin": False,
        "chromosome": False,
        "desaturate": False,
        "male_reference": False,
        "no_shift_xy": False,
        "order": None,
        "cases": {},
    }
    script = "file://../scripts/cnvkit/CNVkitHeatmap.py"
    plugin_opts = {"report": "file://../reports/cnvkit/CNVkitHeatmap.svelte"}


class CNVkitCall(Proc):
    """Run cnvkit.py call

    Input:
        cnrfile: The fixed cnr file (.cnr), used to generate VCF file
        cnsfile: The segmentation file (.cns)
        vcf: VCF file name containing variants for segmentation
            by allele frequencies (optional).
        sample_id: Specify the name of the sample in the VCF to use for b-allele
            frequency extraction and as the default plot title.
        normal_id: Corresponding normal sample ID in the input VCF.
            This sample is used to select only germline SNVs to plot
            b-allele frequencies.
        sample_sex: Specify the sample's chromosomal sex as male or female.
            (Otherwise guessed from X and Y coverage).
        purity: Estimated tumor cell fraction, a.k.a. purity or cellularity.

    Output:
        outdir: The output directory including the call file (.call.cns)
            bed file, and the vcf file

    Envs:
        cnvkit: Path to cnvkit.py
        center: Re-center the log2 ratio values using this estimator of
            the center or average value.
        center_at (type=float): Subtract a constant number from all log2 ratios.
            (For "manual" re-centering, in case the --center option gives
            unsatisfactory results.)
        filter: Merge segments flagged by the specified
            filter(s) with the adjacent segment(s).
        method (choice): Calling method (threshold, clonal or none).
            - threshold: Using hard thresholds for calling each integer copy
                number.
                Use `thresholds` to set a list of threshold log2 values for
                each copy number state
            - clonal: Rescaling and rounding.
                For a given known tumor cell fraction and normal ploidy,
                then simple rounding to the nearest integer copy number
            - none: Do not add a “cn” column or allele copy numbers.
                But still performs rescaling, re-centering, and extracting
                b-allele frequencies from a VCF (if requested).
        thresholds: Hard thresholds for calling each integer copy number,
            separated by commas.
        ploidy (type=float): Ploidy of the sample cells.
        drop_low_coverage (flag): Drop very-low-coverage bins
            before segmentation to avoid false-positive deletions in
            poor-quality tumor samples.
        male_reference (flag): Assume inputs were normalized to a
            male reference.
            (i.e. female samples will have +1 log-CNR of chrX; otherwise
            male samples would have -1 chrX).
        min_variant_depth (type=int): Minimum read depth for a SNV to be
            displayed in the b-allele frequency plot.
        zygosity_freq (type=float): Ignore VCF's genotypes (GT field) and
            instead infer zygosity from allele frequencies.

    Requires:
        cnvkit:
            - check: {{proc.envs.cnvkit}} version
    """
    input = [
        "cnrfile:file",
        "cnsfile:file",
        "vcf:file",
        "sample_id:var",
        "normal_id:var",
        "sample_sex:var",
        "purity:var",
    ]
    # The output directory is named after the segmentation file.
    output = "outdir:dir:{{in.cnsfile | stem0}}.cnvkit"
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        "center": "median",
        "center_at": None,
        "filter": None,
        "method": "threshold",
        "thresholds": "-1.1,-0.25,0.2,0.7",
        "ploidy": 2,
        "drop_low_coverage": False,
        "male_reference": False,
        "min_variant_depth": 20,
        "zygosity_freq": 0.25,
    }
    script = "file://../scripts/cnvkit/CNVkitCall.py"


class CNVkitBatch(Proc):
    """Run cnvkit batch

    If you need in-depth control of the parameters, for example, multiple
    scatter plots in different regions, or you need to specify sample-sex for
    different samples, take a look at `biopipen.ns.cnvkit_pipeline`

    Input:
        metafile: The meta data file containing the sample information
            Two columns BamFile and `envs.type_col` are required.
            The tumor samples should be labeled as `envs.type_tumor` and the
            normal samples should be labeled as `envs.type_normal` in the
            `envs.type_col` column. If normal samples are not found, a
            flat reference will be used.
            There could be other columns in the meta file, but they could be
            used in `biopipen.ns.cnvkit_pipeline`.

    Output:
        outdir: The output directory

    Envs:
        cnvkit: Path to cnvkit.py
        method: Sequencing assay type: hybridization capture ('hybrid'),
            targeted amplicon sequencing ('amplicon'), or whole genome
            sequencing ('wgs'). Determines whether and how to use antitarget
            bins.
        segment_method: cbs,flasso,haar,none,hmm,hmm-tumor,hmm-germline
            Method used in the 'segment' step.
        male_reference: Use or assume a male reference (i.e. female samples
            will have +1 log-CNR of chrX; otherwise male samples would have
            -1 chrX).
        count_reads: Get read depths by counting read midpoints within each bin.
            (An alternative algorithm).
        drop_low_coverage: Drop very-low-coverage bins before segmentation to
            avoid false-positive deletions in poor-quality tumor samples.
        ncores: Number of subprocesses used to running each of the BAM files
            in parallel
        rscript: Path to the Rscript executable to use for running R code.
            Use this option to specify a non-default R installation.
        ref: Path to a FASTA file containing the reference genome.
        targets: Target intervals (.bed or .list) (optional for wgs)
        antitargets: Anti-target intervals (.bed or .list) (optional for wgs)
        annotate: Use gene models from this file to assign names to the
            target regions. Format: UCSC refFlat.txt or ensFlat.txt file
            (preferred), or BED, interval list, GFF, or similar.
        short_names: Reduce multi-accession bait labels to be short
            and consistent.
        target_avg_size: Average size of split target bins
            (results are approximate).
        access: Regions of accessible sequence on chromosomes (.bed),
            as output by the 'access' command.
        access_min_gap_size: Minimum gap size between accessible
            sequence regions if `envs.access` is not specified.
        access_excludes: Exclude these regions from the accessible genome
            Used when `envs.access` is not specified.
        antitarget_avg_size: Average size of antitarget bins
            (results are approximate).
        antitarget_min_size: Minimum size of antitarget bins
            (smaller regions are dropped).
        cluster: Calculate and use cluster-specific summary stats in the
            reference pool to normalize samples.
        reference: Copy number reference file (.cnn) to reuse
        scatter: Create a whole-genome copy ratio profile as a PDF scatter plot.
        diagram: Create an ideogram of copy ratios on chromosomes as a PDF.
        type_col: The column name in the metafile that
            indicates the sample type.
        type_tumor: The type of tumor samples in `envs.type_col` column of
            `in.metafile`
        type_normal: The type of normal samples in `envs.type_col` column of
            `in.metafile`

    Requires:
        cnvkit:
            - check: {{proc.envs.cnvkit}} version
        r-DNAcopy:
            - check: {{proc.envs.rscript}} <(echo "library(DNAcopy)")
    """
    input = "metafile:file"
    output = "outdir:dir:{{in.metafile | stem0}}.cnvkit"
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        "method": "hybrid",
        "segment_method": "cbs",
        "male_reference": False,
        "count_reads": False,
        "drop_low_coverage": False,
        "ncores": config.misc.ncores,
        "rscript": config.lang.rscript,
        "ref": config.ref.reffa,
        "targets": False,
        "antitargets": False,
        "annotate": False,
        "short_names": False,
        "target_avg_size": False,
        "access": False,
        "access_min_gap_size": 5000,
        "access_excludes": False,
        "antitarget_avg_size": False,
        "antitarget_min_size": False,
        "cluster": False,
        "reference": False,
        "scatter": True,
        "diagram": True,
        "type_tumor": "Tumor",
        "type_normal": "Normal",
        "type_col": "SampleType",
    }
    script = "file://../scripts/cnvkit/CNVkitBatch.py"


class CNVkitGuessBaits(Proc):
    """Guess the bait intervals from the bam files

    It runs scripts/guess_baits.py from the cnvkit repo.

    Input:
        bamfiles: The bam files
        atfile: The potential target file or access file
            e.g. all known exons in the reference genome or
            from `cnvkit.py access`

    Output:
        targetfile: The target file

    Envs:
        cnvkit: Path to cnvkit.py
        guided (flag): `in.atfile` is a potential target file when
            `True`, otherwise it is an access file.
        samtools: Path to samtools executable
        ncores (type=int): Number of subprocesses to segment in parallel
            `0` to use the maximum number of available CPUs.
        ref: Path to a FASTA file containing the reference genome.
        min_depth (type=int): Minimum sequencing read depth to accept as
            captured. For guided only.
        min_gap (type=int): Merge regions separated by gaps smaller than this.
        min_length (type=int): Minimum region length to accept as captured.
            `min_gap` and `min_length` are for unguided only.
    """
    input = "bamfiles:files, atfile:file"
    # NOTE(review): this name uses the `stem` filter and an `_etc` suffix,
    # while the other multi-file processes in this module use `stem0` and
    # `-etc`. Left unchanged because renaming would alter output file names.
    output = "targetfile:file:{{in.bamfiles | first | stem}}_etc.baits.bed"
    lang = config.lang.python
    envs = {
        "cnvkit": config.exe.cnvkit,
        "samtools": config.exe.samtools,
        "ncores": config.misc.ncores,
        "ref": config.ref.reffa,
        "guided": None,
        "min_depth": 5,
        "min_gap": 25,
        "min_length": 50,
    }
    script = "file://../scripts/cnvkit/CNVkitGuessBaits.py"