#!/usr/bin/env bash if [ $# -lt 3 ]; then echo "usage: $0 [regions file] [ncpus] [freebayes arguments]" echo echo "Run freebayes in parallel over regions listed in regions file, using ncpus processors." echo "Will merge and sort output, producing a uniform VCF stream on stdout. Flags to freebayes" echo "which would write to e.g. a particular file will obviously cause problms, so caution is" echo "encouraged when using this script." echo echo "examples:" echo echo "Run freebayes in parallel on 100000bp chunks of the ref (fasta_generate_regions.py is also" echo "located in the scripts/ directory in the freebayes distribution). Use 36 threads." echo echo " freebayes-parallel <(fasta_generate_regions.py ref.fa.fai 100000) 36 -f ref.fa aln.bam >out.vcf" echo echo "Generate regions that are equal in terms of data content, and thus have lower variance" echo "in runtime. This will yield better resource utilization." echo echo " bamtools coverage -in aln.bam | coverage_to_regions.py ref.fa.fai 500 >ref.fa.500.regions" echo " freebayes-parallel ref.fa.500.regions 36 -f ref.fa aln.bam >out.vcf" echo exit fi regionsfile=$1 shift ncpus=$1 shift command=("freebayes" "$@") ( #$command | head -100 | grep "^#" # generate header # iterate over regions using gnu parallel to dispatch jobs cat "$regionsfile" | parallel -k -j "$ncpus" "${command[@]}" --region {} ) | vcffirstheader \ | vcfstreamsort -w 1000 | vcfuniq # remove duplicates at region edges