"""Tools to handle VCF files"""
from ..core.proc import Proc
from ..core.config import config
class VcfLiftOver(Proc):
    """Liftover a VCF file using GATK

    Input:
        invcf: The input VCF file

    Output:
        outvcf: The output VCF file

    Envs:
        gatk: The path to gatk4, which should be installed via conda
        chain: The map chain file for liftover
        tmpdir: Directory for temporary storage of working files
        reffa: The reference genome fasta file.
            Defaults to `config.ref.reffa`.
            NOTE(review): presumably the reference of the target build
            handed to `gatk LiftoverVcf`; confirm against the script.
        args: Other CLI arguments for `gatk LiftoverVcf`
    """

    input = "invcf:file"
    output = "outvcf:file:{{in.invcf | basename}}"
    envs = {
        "gatk": config.exe.gatk4,
        "chain": config.path.liftover_chain,
        "tmpdir": config.path.tmpdir,
        "reffa": config.ref.reffa,
        "args": {},
    }
    lang = config.lang.bash
    script = "file://../scripts/vcf/VcfLiftOver.sh"
class VcfFilter(Proc):
    """Filter records in vcf file

    Input:
        invcf: The input vcf file, could be bgzipped.

    Output:
        outfile: The filtered vcf file. If `in.invcf` is bgzipped, then
            this will be bgzipped.

    Envs:
        filters: A dict of filters with keys the filter names.
            >>> # Typically
            >>> lambda variant: <expression>
            Things to notice
            1. Filters should return `False` to get variant filtered out
            2. See https://brentp.github.io/cyvcf2/docstrings.html#cyvcf2.cyvcf2.Variant
                For what you can do with the variant
            3. The filter python functions should be in string representation
            4. Builtin filters can have parameters `{"QUAL": 30}`
            5. List of builtin filters. Specify them like: `{"FILTER": params}`
                `SNPONLY`: keeps only SNPs (`{"SNPONLY": False}` to filter SNPs out)
                `QUAL`: keeps variants with QUAL>=param (`{"QUAL": (30, False)}`)
                to keep only variants with QUAL<30
        filter_descs: Descriptions for the filters. Will be saved to the header
            of the output vcf file
        helper: Some helper code for the filters
        keep: Keep the variants not passing the filters?
    """  # noqa: E501

    input = "invcf:file"
    output = "outfile:file:{{in.invcf | basename}}"
    lang = config.lang.python
    envs = {
        "filters": {},
        "keep": True,
        "helper": "",
        "filter_descs": {},
    }
    script = "file://../scripts/vcf/VcfFilter.py"
class VcfIndex(Proc):
    """Index VCF files. If they are already indexed, use the index files

    Input:
        infile: The input VCF file

    Output:
        outfile: The output VCF file (bgzipped)
        outidx: The index file of the output VCF file

    Envs:
        tabix: Path to tabix
        ncores: Number of cores to use.
            Defaults to `config.misc.ncores`.
    """

    input = "infile:file"
    # Keep the name for inputs that already end with ".gz"; otherwise append
    # ".gz" to the output name. The index name is the output name + ".tbi".
    output = """
        {%- if in.infile.endswith(".gz") %}
        outfile:file:{{in.infile | basename}},
        outidx:file:{{in.infile | basename | append: ".tbi"}}
        {%- else -%}
        outfile:file:{{in.infile | basename | append: ".gz"}},
        outidx:file:{{in.infile | basename | append: ".gz.tbi"}}
        {% endif -%}
    """
    lang = config.lang.python
    envs = {
        "tabix": config.exe.tabix,
        "ncores": config.misc.ncores,
    }
    script = "file://../scripts/vcf/VcfIndex.py"
class Vcf2Bed(Proc):
    """Convert Vcf file to Bed file

    Input:
        infile: The vcf file

    Output:
        outfile: The converted bed file

    Envs:
        inbase: The coordinate base of the vcf file
        outbase: The coordinate base of the bed file

    Requires:
        cyvcf2:
            - check: {{proc.lang}} -c "import cyvcf2"
    """

    input = "infile:file"
    output = "outfile:file:{{in.infile | stem0}}.bed"
    lang = config.lang.python
    envs = {"inbase": 1, "outbase": 0}
    script = "file://../scripts/vcf/Vcf2Bed.py"
class VcfDownSample(Proc):
    """Down-sample VCF files to keep only a subset of variants in there

    Input:
        infile: The input VCF file

    Output:
        outfile: The output VCF file with subset variants
            Gzipped if `in.infile` is gzipped

    Envs:
        n: Fraction/Number of variants to keep
            If `n > 1`, it is the number.
            If `n <= 1`, it is the fraction.
    """

    input = "infile:file"
    output = "outfile:file:{{in.infile | basename}}"
    envs = {"n": 0}
    lang = config.lang.bash
    script = "file://../scripts/vcf/VcfDownSample.sh"
class VcfSplitSamples(Proc):
    """Split a VCF file into multiple VCF files, one for each sample

    Input:
        infile: The input VCF file

    Output:
        outdir: The output directory containing the split VCF files

    Envs:
        bcftools: Path to bcftools
        gz: Gzip the output VCF files? Has to be True if `envs.index` is True
        index: Index the output VCF files?
        ncores: Number of cores, used to extract samples, but not to index
        private: Keep sites where only the sample carries a non-ref allele.
            That means, sites with genotypes like `0/0` will be removed.
    """

    input = "infile:file"
    output = "outdir:dir:{{in.infile | stem}}.splitsamples"
    lang = config.lang.python
    envs = {
        "bcftools": config.exe.bcftools,
        "gz": True,
        "index": True,
        "ncores": config.misc.ncores,
        "private": True,
    }
    script = "file://../scripts/vcf/VcfSplitSamples.py"
class VcfIntersect(Proc):
    """Find variants in both VCF files

    Input:
        infile1: The first VCF file
        infile2: The second VCF file

    Output:
        outfile: The output VCF file with subset variants in both files

    Envs:
        bcftools: Path to bcftools
        gz: Gzip the output VCF files? Has to be True if `envs.index` is True
        index: Index the output VCF files?
        keep_as: Keep the variants as presented in the first (0) or
            the second (1) file?
        collapse: How to match the variants in the two files? Will be passed to
            `bcftools isec -c` option. See also
            https://samtools.github.io/bcftools/bcftools.html#common_options
            - none: only records with identical REF and ALT alleles are
                compatible
            - some: only records where some subset of ALT alleles match are
                compatible
            - all: all records are compatible, regardless of whether the ALT
                alleles match or not.
            - snps: any SNP records are compatible, regardless of whether the
                ALT alleles match or not.
            - indels: any indel records are compatible, regardless of whether
                the ALT alleles match or not.
            - both: abbreviates `snps` and `indels`
            - id: only records with identical ID are compatible
    """

    input = "infile1:file, infile2:file"
    # The ".gz" suffix is only appended when `envs.gz` is truthy.
    output = """
        outfile:file:{{in.infile1 | stem0}}.intersect.{{in.infile2 | stem0}}.vcf
        {%- if envs.gz -%}.gz{%- endif -%}
    """
    lang = config.lang.python
    envs = {
        "bcftools": config.exe.bcftools,
        "gz": True,
        "index": True,
        # `keep_as` is documented above but had no default; 0 keeps the
        # records as presented in the first file.
        "keep_as": 0,
        "collapse": "all",
    }
    script = "file://../scripts/vcf/VcfIntersect.py"
class VcfFix(Proc):
    """Fix some issues with VCF files

    Input:
        infile: The input VCF file

    Output:
        outfile: The output VCF file

    Envs:
        fixes: A list of fixes to apply.
            Each one is a dict with keys `kind`, `id`, `regex`, `append`
            and `fix`
            `kind`: The kind of fix. Including
                `filter` the FILTERs in the header,
                `info` the info INFOs in the header,
                `contig` the contig lines in the header
                `format` the FORMATs in the header,
                `colnames` the column names in the header
                `header` general header item
                `variant` the variants
                `None` matches everything
            `id`: The ID to match. If `kind` is `filter`, `info`, `contig`
                or `format`, then it matches the `ID` of the item. If `kind`
                is `variant`, then it matches the `ID` of the variant.
                If a list is given, then it matches any of the IDs in the list.
            `regex`: The regular expression to match. When `id` is given,
                this is ignored.
            `append`: Whether to append a record instead of to replace an
                existing one. When it is True, `kind` has to not be `None`
            `fix`: The fix to apply in the format of a lambda function
                (in string), with a single argument.
                The function should either return a string (raw representation)
                for the record, the record itself, `None`, `False`.
                If `None` is returned, the original record is used. if `False`,
                the record is removed.
                If `append` is `True`, then the function should either return
                a string or an object. And the argument is `None`
                The argument is a different object based on different `kind`s.
                When `kind` is `None`, the argument is the plain line of the
                record with line ending.
                When `kind` is `info` or `format`, the record is a dict with
                keys `ID`, `Description`, `Type` and `Number`.
                When `kind` is `filter`, the record is a dict with keys
                `ID` and `Description`.
                When `kind` is `contig`, the record is a dict with keys `ID`
                and `length`.
                When `kind` is `header`, the record is a dict
                with `key` the name of the header and `value` the value of the
                header.
                When `kind` is `colnames`, the record is a list of column names.
                When `kind` is `variant`, the record is a dict with
                keys `CHROM`, `POS`, `REF`, `ALT`, `QUAL`, `FILTER`, `INFO`,
                `FORMAT` and `SAMPLES`. `INFO` is a dict with key-value pairs
                and `SAMPLES` are a list of values for each sample. Each value
                is also a list of values for each FORMAT.
            If a record matches multiple fixes, the first one is applied.
        helpers: raw code to provide some helpers for the fixes
            The code will be automatically dedented if given as a string. A list
            of strings is also supported and will be joined with newlines.
    """

    input = "infile:file"
    output = "outfile:file:{{in.infile | basename}}"
    lang = config.lang.python
    envs = {"fixes": [], "helpers": ""}
    script = "file://../scripts/vcf/VcfFix.py"
class VcfAnno(Proc):
    """Annotate a VCF file using vcfanno

    https://github.com/brentp/vcfanno

    Input:
        infile: The input VCF file
        conffile: The configuration file for vcfanno or configuration dict
            itself

    Output:
        outfile: The output VCF file

    Envs:
        vcfanno: Path to vcfanno
        ncores: Number of cores to use
        conffile: configuration file for vcfanno or configuration dict itself
            This is ignored when `conffile` is given as input
        args: Additional arguments to pass to vcfanno

    Requires:
        - name: vcfanno
          check: |
            {{proc.envs.vcfanno}} --help
    """

    input = "infile:file, conffile"
    # The template previously interpolated `envs.tool`, which is not a key of
    # `envs` below; use the literal tool name so the suffix is well-defined.
    output = "outfile:file:{{in.infile | stem0}}.vcfanno.vcf"
    lang = config.lang.python
    envs = {
        "vcfanno": config.exe.vcfanno,
        "ncores": config.misc.ncores,
        "conffile": {},
        "args": {"permissive-overlap": True},
    }
    script = "file://../scripts/vcf/VcfAnno.py"
class TruvariBench(Proc):
    """Run `truvari bench` to compare a VCF with CNV calls and
    base CNV standards

    Requires truvari v4+

    See https://github.com/ACEnglish/truvari/wiki/bench

    Input:
        compvcf: The VCF file with CNV calls to compare
        basevcf: The VCF file with standard CNVs

    Output:
        outdir: The output directory

    Envs:
        truvari: Path to truvari
        `<other>`: Other `truvari bench` arguments
            Defaults are preset here for `ref`, `refdist`, `pctseq`,
            `pctsize`, `pctovl`, `typeignore` and `multimatch`.

    Requires:
        truvari:
            - check: {{proc.envs.truvari}} version
    """

    input = "compvcf:file, basevcf:file"
    output = "outdir:dir:{{in.compvcf | stem0 | append: '.truvari_bench'}}"
    envs = {
        "truvari": config.exe.truvari,
        "ref": config.ref.reffa,
        "refdist": 500,
        "pctseq": 0.7,
        "pctsize": 0.7,
        "pctovl": 0.0,
        "typeignore": False,
        "multimatch": False,
    }
    lang = config.lang.bash
    script = "file://../scripts/vcf/TruvariBench.sh"
class TruvariBenchSummary(Proc):
    """Summarise the statistics from `TruvariBench` for multiple jobs (VCFs)

    Input:
        indirs: The input directories, which should be the output directories
            of `TruvariBench`

    Output:
        outdir: The output directory, including the summary table and plots

    Envs:
        plots: The stats to plot with barplots.
            Candidates are `TP-base`, `TP-comp`, `FP`, `FN`, `precision`,
            `recall`, `f1`, `base cnt`, `comp cnt`, `TP-comp_TP-gt`,
            `TP-comp_FP-gt`, `TP-base_TP-gt`, `TP-base_FP-gt`, and
            `gt_concordance`
            (truvari v4+ names; older releases used `call` in place of `comp`)
            See https://github.com/ACEnglish/truvari/wiki/bench
        devpars: The parameters to use for the plots.

    Requires:
        r-ggprism:
            - check: {{proc.lang}} -e "library(ggprism)"
        r-rjson:
            - check: {{proc.lang}} -e "library(rjson)"
        r-dplyr:
            - check: {{proc.lang}} -e "library(dplyr)"
        r-ggplot2:
            - check: {{proc.lang}} -e "library(ggplot2)"
    """

    input = "indirs:files"
    # Gather the first column of the incoming channel into one list, so a
    # single job receives all the directories.
    input_data = lambda ch: [list(ch.iloc[:, 0])]
    output = "outdir:dir:truvari_bench.summary"
    lang = config.lang.rscript
    envs = {
        "plots": ["comp cnt", "base cnt", "precision", "recall", "f1"],
        "devpars": None,
    }
    script = "file://../scripts/vcf/TruvariBenchSummary.R"
    plugin_opts = {"report": "file://../reports/vcf/TruvariBenchSummary.svelte"}
class TruvariConsistency(Proc):
    """Run `truvari consistency` to check consistency of CNV calls

    See https://github.com/ACEnglish/truvari/wiki/consistency

    Requires truvari v4+

    Input:
        vcfs: The vcf files with CNV calls

    Output:
        outdir: The output directory, named after the stem of the first
            vcf file with `.etc.truvari_consistency` appended.
            NOTE(review): presumably holds the consistency report and the
            heatmap; confirm against the script.

    Envs:
        truvari: Path to truvari
        heatmap: Whether to generate a heatmap of the consistency
            Set to False to disable
            annofile: The annotation file for the heatmap, multiple columns
            but the first column must be the sample name. Note that the stem of
            the vcf file name from consistency file will be used. These
            annotations will be added as row annotations.
            Other options see also `biopipen.ns.plot.Heatmap`.
    """

    input = "vcfs:files"
    output = (
        "outdir:dir:"
        "{{in.vcfs | first | stem0 | append: '.etc.truvari_consistency'}}"
    )
    lang = config.lang.rscript
    envs = {"truvari": config.exe.truvari, "heatmap": {}}
    script = "file://../scripts/vcf/TruvariConsistency.R"
    plugin_opts = {"report": "file://../reports/vcf/TruvariConsistency.svelte"}
class BcftoolsAnnotate(Proc):
    """Add or remove annotations from VCF files

    See also: <https://samtools.github.io/bcftools/bcftools.html#annotate>

    Input:
        infile: The input VCF file
        annfile: The annotation file.
            Currently only VCF files are supported.

    Output:
        outfile: The VCF file with annotations added or removed.

    Envs:
        bcftools: Path to bcftools
        tabix: Path to tabix, used to index infile and annfile
        annfile: The annotation file. If `in.annfile` is provided,
            this is ignored
        ncores (type=int): Number of cores (`--threads`) to use
        columns (auto): Comma-separated or list of columns or tags to carry over from
            the annotation file. Overrides `-c, --columns`
        remove (auto): Remove the specified columns from the input file
        header (list): Headers to be added
        gz (flag): Whether to gzip the output file
        index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
        <more>: Other arguments for `bcftools annotate`
            See also <https://samtools.github.io/bcftools/bcftools.html#annotate>
            Note that the underscore `_` will be replaced with dash `-` in the
            argument name.
    """  # noqa: E501

    input = "infile:file, annfile:file"
    # ".gz" is appended when either `envs.index` or `envs.gz` is truthy.
    output = (
        "outfile:file:{{in.infile | stem: 'gz'}}.vcf"
        "{{'.gz' if envs.index or envs.gz else ''}}"
    )
    lang = config.lang.python
    envs = {
        "bcftools": config.exe.bcftools,
        "tabix": config.exe.tabix,
        "annfile": None,
        "columns": [],
        "remove": [],
        "header": [],
        "gz": True,
        "index": True,
        "ncores": config.misc.ncores,
    }
    script = "file://../scripts/vcf/BcftoolsAnnotate.py"
class BcftoolsFilter(Proc):
    """Apply fixed threshold filters to VCF files

    Input:
        infile: The input VCF file

    Output:
        outfile: The filtered VCF file. If the `in.infile` is gzipped, this is
            gzipped as well.

    Envs:
        bcftools: Path to bcftools
        tabix: Path to tabix, used to index infile/outfile
        ncores (type=int): Number of cores (`--threads`) to use
        keep: Whether we should keep the filtered variants or not.
            If True, the filtered variants will be kept in the output file, but
            with a new FILTER.
        includes: and
        excludes: include/exclude only sites for which EXPRESSION is true.
            See: <https://samtools.github.io/bcftools/bcftools.html#expressions>
            If provided, `envs.include/exclude` will be ignored.
            If `str`/`list` used, The filter names will be `Filter_<type>_<index>`.
            A dict is used where keys are filter names and values are expressions
        gz (flag): Whether to gzip the output file
        index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
        <more>: Other arguments for `bcftools filter`
            See also <https://samtools.github.io/bcftools/bcftools.html#filter>
    """  # noqa: E501

    input = "infile:file"
    # ".gz" is appended when either `envs.index` or `envs.gz` is truthy.
    output = (
        "outfile:file:{{in.infile | stem: 'gz'}}.vcf"
        "{{'.gz' if envs.index or envs.gz else ''}}"
    )
    lang = config.lang.python
    envs = {
        "bcftools": config.exe.bcftools,
        "tabix": config.exe.tabix,
        "ncores": config.misc.ncores,
        "keep": True,
        "includes": None,
        "excludes": None,
        "gz": True,
        "index": True,
    }
    script = "file://../scripts/vcf/BcftoolsFilter.py"
class BcftoolsSort(Proc):
    """Sort VCF files using `bcftools sort`.

    `bcftools sort` is used to sort VCF files by chromosome and position based on the
    order of contigs in the header.

    Here we provide a chrsize file to first sort the contigs in the header and then
    sort the VCF file using `bcftools sort`.

    Input:
        infile: The input VCF file

    Output:
        outfile: The sorted VCF file.

    Envs:
        bcftools: Path to bcftools
        tabix: Path to tabix, used to index infile/outfile
        ncores (type=int): Number of cores (`--threads`) to use
        gz (flag): Whether to gzip the output file
        index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
        chrsize: The chromosome size file, from which the chromosome order is used
            to sort the contig in the header first.
            If not provided, `bcftools sort` will be used directly.
        notfound (choice): What if the contig in the VCF file is not found in the
            `chrsize` file.
            - error: Report error
            - remove: Remove the contig from the header.
                Note that if there are records with the removed contig, an error will
                be raised by `bcftools sort`
            - start: Move the contig to the start of the contigs from `chrsize`
            - end: Move the contig to the end of the contigs from `chrsize`
        <more>: Other arguments for `bcftools sort`. For example `max_mem`.
            See also <https://samtools.github.io/bcftools/bcftools.html#sort>
    """  # noqa: E501

    input = "infile:file"
    # ".gz" is appended when either `envs.index` or `envs.gz` is truthy.
    output = (
        "outfile:file:{{in.infile | stem: 'gz'}}.vcf"
        "{{'.gz' if envs.index or envs.gz else ''}}"
    )
    lang = config.lang.python
    envs = {
        "bcftools": config.exe.bcftools,
        "tabix": config.exe.tabix,
        "ncores": config.misc.ncores,
        "chrsize": config.ref.chrsize,
        "notfound": "remove",
        "gz": True,
        "index": True,
    }
    script = "file://../scripts/vcf/BcftoolsSort.py"
class BcftoolsView(Proc):
    """View, subset and filter VCF files by position and filtering expression.

    Also convert between VCF and BCF.

    Input:
        infile: The input VCF file
        regions_file: The region file used to subset the input VCF file.
        samples_file: The samples file used to subset the input VCF file.

    Output:
        outfile: The output VCF file.

    Envs:
        bcftools: Path to bcftools
        tabix: Path to tabix, used to index infile/outfile
        ncores (type=int): Number of cores (`--threads`) to use
        regions_file: The region file used to subset the input VCF file.
            If `in.regions_file` is provided, this is ignored.
        samples_file: The samples file used to subset the input VCF file.
            If `in.samples_file` is provided, this is ignored.
        gz (flag): Whether to gzip the output file
        index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
        <more>: Other arguments for `bcftools view`.
            See also https://samtools.github.io/bcftools/bcftools.html#view
            Note that the underscore `_` will be replaced with dash `-` in the
            argument name.
    """  # noqa: E501

    input = "infile:file, regions_file:file, samples_file:file"
    # ".gz" is appended when either `envs.index` or `envs.gz` is truthy.
    output = (
        "outfile:file:{{in.infile | stem: 'gz'}}.vcf"
        "{{'.gz' if envs.index or envs.gz else ''}}"
    )
    lang = config.lang.python
    envs = {
        "bcftools": config.exe.bcftools,
        "tabix": config.exe.tabix,
        "ncores": config.misc.ncores,
        "regions_file": None,
        "samples_file": None,
        "gz": True,
        "index": True,
    }
    script = "file://../scripts/vcf/BcftoolsView.py"