```sh
---------- updated ------------------------
Step 3: replace the readgroup in the bams
S1/unique/S1.realigned.bam
S2/unique/S2.realigned.bam
S3/unique/S3.realigned.bam
S4/unique/S4.realigned.bam
S5/unique/S5.realigned.bam
S6/unique/S6.realigned.bam
S7/unique/S7.realigned.bam
S8/unique/S8.realigned.bam
TOOL: man picard-tools
BEFORE=(ID:haploid SM:3-va18942-wt PL:ILLUMINA) vs AFTER=(RGPL=illumina ID:3-va18942-wt SM:3-va18942-wt LB=standard PU=pu5)
3-va18942-wt.bam: RGPL=illumina RGID=3-va18942-wt RGSM=3-va18942-wt RGLB=standard RGPU=pu3
picard-tools AddOrReplaceReadGroups I=S1.realigned.bam O=S1_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=S1 RGSM=S1 RGLB=standard RGPU=s1 VALIDATION_STRINGENCY=LENIENT
picard-tools AddOrReplaceReadGroups I=S2/unique/S2.realigned.bam O=S2_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=S2 RGSM=S2 RGLB=standard RGPU=s2 VALIDATION_STRINGENCY=LENIENT
picard-tools AddOrReplaceReadGroups I=S3/unique/S3.realigned.bam O=S3_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=S3 RGSM=S3 RGLB=standard RGPU=s3 VALIDATION_STRINGENCY=LENIENT
picard-tools AddOrReplaceReadGroups I=S4/unique/S4.realigned.bam O=S4_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=S4 RGSM=S4 RGLB=standard RGPU=s4 VALIDATION_STRINGENCY=LENIENT
picard-tools AddOrReplaceReadGroups I=S5/unique/S5.realigned.bam O=S5_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=S5 RGSM=S5 RGLB=standard RGPU=s5 VALIDATION_STRINGENCY=LENIENT
picard-tools AddOrReplaceReadGroups I=S6/unique/S6.realigned.bam O=S6_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=S6 RGSM=S6 RGLB=standard RGPU=s6 VALIDATION_STRINGENCY=LENIENT
picard-tools AddOrReplaceReadGroups I=S7/unique/S7.realigned.bam O=S7_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=S7 RGSM=S7 RGLB=standard RGPU=s7 VALIDATION_STRINGENCY=LENIENT
picard-tools AddOrReplaceReadGroups I=S8/unique/S8.realigned.bam O=S8_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=S8 RGSM=S8 RGLB=standard RGPU=s8 VALIDATION_STRINGENCY=LENIENT
https://www.biostars.org/p/16242/
method 1: merge the readgroup-added bams into the file merged_picard.bam ==> ERROR
picard-tools MergeSamFiles INPUT=3-va18942-wt.bam INPUT=4-va18942-scv.bam OUTPUT=merged_picard.bam
method 2 ==> merged_samtools.bam
samtools view -H merged_picard.bam > header.sam
samtools merge -h header.sam merged_samtools.bam 3-va18942-wt.bam 4-va18942-scv.bam
check if the merged_picard is fine
samtools view -c merged_samtools.bam
variant calling using GATK + filtering
samtools index merged_samtools.bam // ~/Fastqs/E_faecalis_va18942_vc/trimmed_CROP300_V583/Phylo/bams/merged_samtools.bam
Direkt SNP calling is not suggested. It is better if we take a realigning step before SNP and Indel-calling.
java -jar /home/jhuang/Tools/SPANDx_v3.1/GenomeAnalysisTK.jar -T UnifiedGenotyper -glm SNP -R ~/DATA/Data_Anna4_SNP/Raw_Data_E_faecalis_va18942_vc/trimmed_CROP300_V583/E_faecalis_V583.fasta -I merged_samtools.bam -o master_GATK.vcf -ploidy 1
==> ERROR: Expected 542 but only received 275; BinaryCodec in readmode; file:
realigning after merging
samtools index merged_samtools.bam
samtools sort merged_samtools.bam merged_picard_sorted
/usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/GenomeAnalysisTK.jar -T RealignerTargetCreator -I merged_samtools.bam -R /home/jhuang/DATA/Data_Anna4_SNP/Raw_Data_E_faecalis_va18942_vc/trimmed_CROP300_V583/E_faecalis_V583.fasta -o realigned.merged.bam.list
/usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/GenomeAnalysisTK.jar -T IndelRealigner -I merged_samtools.bam -R /home/jhuang/DATA/Data_Anna4_SNP/Raw_Data_E_faecalis_va18942_vc/trimmed_CROP300_V583/E_faecalis_V583.fasta -targetIntervals realigned.merged.bam.list -o realigned.merged.realigned.bam
java -jar /home/jhuang/Tools/SPANDx_v3.1/GenomeAnalysisTK.jar -glm SNP -R /home/jhuang/DATA/Data_Anna4_SNP/Raw_Data_E_faecalis_va18942_vc/trimmed_CROP300_V583/E_faecalis_V583.fasta -T UnifiedGenotyper -I realigned.merged.realigned.bam -o merged.snps.raw.vcf -ploidy 1
java -jar /home/jhuang/Tools/SPANDx_v3.1/GenomeAnalysisTK.jar -glm INDEL -R /home/jhuang/DATA/Data_Anna4_SNP/Raw_Data_E_faecalis_va18942_vc/trimmed_CROP300_V583/E_faecalis_V583.fasta -T UnifiedGenotyper -I realigned.merged.realigned.bam -o merged.indels.raw.vcf -ploidy 1
-- until here --
-L sorted.bed
http://gatkforums.broadinstitute.org/gatk/discussion/2498/calling-snps-indel-by-unified-genotyper
$java -Xmx100g -jar GenomeAnalysisTK.jar -T UnifiedGenotyper -R umd3.1_genome.fa
-I Sample_1_rcal.bam -I Sample_2_rcal.bam -I Sample_3_rcal.bam -I Sample_4_rcal.bam
--dbsnp cow_dbsnp133_snps_indels.vcf -o snps.raw.vcf
--min_base_quality_score 30 -stand_call_conf 50 – stand_emit_conf 10
-dcov 200
Filtering
/usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/GenomeAnalysisTK.jar -T VariantFiltration -R /home/jhuang/DATA/Data_Anna4_SNP/Raw_Data_S_epidermidis_va8169/trimmed_CROP300_REF/GCF_000007645.1_ASM764v1_genomic.fasta -o merged.snps.marked.vcf -V merged.snps.raw.vcf --clusterSize 3 --clusterWindowSize 10 --filterExpression "MLEAF < 0.95" --filterName "AFFilter" --filterExpression "QD < 10.0" --filterName "QDFilter" --filterExpression "MQ < 30.0" --filterName "MQFilter" --filterExpression "FS > 10.0" --filterName "FSFilter" --filterExpression "HaplotypeScore > 20.0" --filterName "HaplotypeScoreFilter" --filterExpression "QUAL < 30.0" --filterName "QualFilter" --filterExpression "MQ0 >= 4 && '((MQ0 / (1.0 * DP))>0.1)'" --filterName "HARD_TO_VALIDATE"
/usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/GenomeAnalysisTK.jar -T VariantFiltration -R /home/jhuang/DATA/Data_Anna4_SNP/Raw_Data_S_epidermidis_va8169/trimmed_CROP300_REF/GCF_000007645.1_ASM764v1_genomic.fasta -o merged.snps.marked.vcf -V merged.snps.raw.vcf --clusterSize 3 --clusterWindowSize 10 --filterExpression "MLEAF < 0.95" --filterName "AFFilter" --filterExpression "QD < 10.0" --filterName "QDFilter" --filterExpression "MQ < 30.0" --filterName "MQFilter" --filterExpression "FS > 10.0" --filterName "FSFilter" --filterExpression "HaplotypeScore > 20.0" --filterName "HaplotypeScoreFilter" --filterExpression "QUAL < 30.0 || DP < (288.06/2) || DP > (288.06*3)" --filterName "StandardFilters" --filterExpression "MQ0 >= 4 && '((MQ0 / (1.0 * DP))>0.1)'" --filterName "HARD_TO_VALIDATE"
/usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/GenomeAnalysisTK.jar -T VariantFiltration -R /home/jhuang/DATA/Data_Anna4_SNP/Raw_Data_S_epidermidis_va8169/trimmed_CROP300_REF/GCF_000007645.1_ASM764v1_genomic.fasta -o merged.snps.AMB.vcf -V merged.snps.raw.vcf --clusterSize 3 --clusterWindowSize 10 --filterExpression "MLEAF < 0.95" --filterName "FAIL" --filterExpression "QD < 10.0" --filterName "FAIL1" --filterExpression "MQ < 30.0" --filterName "FAIL2" --filterExpression "FS > 10.0" --filterName "FAIL3" --filterExpression "HaplotypeScore > 20.0" --filterName "FAIL4" --filterExpression "QUAL < 30.0 || DP < (288.06/2) || DP > (288.06*3)" --filterName "FAIL5" --filterExpression "MQ0 >= 4 && '((MQ0 / (1.0 * DP)) > 0.1)'" --filterName "FAIL6"
/usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/GenomeAnalysisTK.jar -T VariantFiltration -R /home/jhuang/DATA/Data_Anna4_SNP/Raw_Data_S_epidermidis_va8169/trimmed_CROP300_REF/GCF_000007645.1_ASM764v1_genomic.fasta -o merged.indels.marked.vcf -V merged.indels.raw.vcf --filterExpression "MLEAF < 0.95" --filterName "AFFilter" --filterExpression "MQ < 30.0" --filterName "MQFilter" --filterExpression "QD < 10.0" --filterName "QDFilter" --filterExpression "FS > 10.0" --filterName "FSFilter" --filterExpression "MQ0 >= 4 && '((MQ0 / (1.0 * DP)) > 0.1)'" --filterName "HARD_TO_VALIDATE" --filterExpression "QUAL < 30.0 || DP < (288.06/2) || DP > (288.06*3)" --filterName "QualFilter"
/usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/GenomeAnalysisTK.jar -T VariantFiltration -R /home/jhuang/DATA/Data_Anna4_SNP/Raw_Data_S_epidermidis_va8169/trimmed_CROP300_REF/GCF_000007645.1_ASM764v1_genomic.fasta -o merged.indels.AMB.vcf -V merged.indels.raw.vcf --filterExpression "MLEAF < 0.95" --filterName "FAIL" --filterExpression "MQ < 30.0" --filterName "FAIL1" --filterExpression "QD < 10.0" --filterName "FAIL2" --filterExpression "FS > 10.0" --filterName "FAIL3" --filterExpression "MQ0 >= 4 && '((MQ0 / (1.0 * DP)) > 0.1)'" --filterName "FAIL4" --filterExpression "QUAL < 30.0 || DP < (288.06/2) || DP > (288.06*3)" --filterName "FAIL5"
filter_PASS.py merged.snps.marked.vcf > merged.snps.filtered.vcf
cat merged.snps.raw.vcf | /home/jhuang/Tools/freebayes_git/freebayes/vcflib/bin/vcffilter -f 'QUAL > 30' > merged.snps.filtered.vcf
variant calling using freebayes + filtering
VCFFILTER="/home/jhuang/Tools/freebayes_git/freebayes/vcflib/bin/vcffilter"
FREEBAYES="/usr/bin/freebayes"
freebayes --fasta-reference ~/DATA/Data_Anna4_SNP/Raw_Data_S_epidermidis_va8169/trimmed_CROP300_REF/GCF_000007645.1_ASM764v1_genomic.fasta -b merged_samtools.bam -p 1 -i -X -u | /home/jhuang/Tools/freebayes_git/freebayes/vcflib/bin/vcffilter -f 'QUAL > 30' > merged.snps_filtered_freebayes.vcf
freebayes --fasta-reference ~/DATA/Data_Anna4_SNP/Raw_Data_S_epidermidis_va8169/trimmed_CROP300_REF/GCF_000007645.1_ASM764v1_genomic.fasta -b merged_samtools.bam -p 1 -I -X -u | /home/jhuang/Tools/freebayes_git/freebayes/vcflib/bin/vcffilter -f 'QUAL > 30' > merged.indels_filtered_freebayes.vcf
--targets sorted.bed
without filtering
freebayes --fasta-reference ~/DATA/Data_Anna4_SNP/Raw_Data_S_epidermidis_va8169/trimmed_CROP300_REF/GCF_000007645.1_ASM764v1_genomic.fasta -b merged_samtools.bam -p 1 -i -X -u > merged.snps_freebayes.vcf
freebayes --fasta-reference ~/DATA/Data_Anna4_SNP/Raw_Data_S_epidermidis_va8169/trimmed_CROP300_REF/GCF_000007645.1_ASM764v1_genomic.fasta -b merged_samtools.bam -p 1 -I -X -u > merged.indels_freebayes.vcf
merging the results from GATK and freebays
filter_out.annotated.vcf.py merged.snps.filtered.vcf > merged.snps.filtered_.vcf
filter_out.annotated.vcf.py merged.snps_filtered_freebayes.vcf > merged.snps_filtered_freebayes_.vcf
cat merged.snps.filtered_.vcf merged.snps_filtered_freebayes_.vcf > merged.snps.filtered.both.vcf
sort -t$' ' -k1,1 -g merged.snps.filtered.both.vcf > merged.snps.filtered.both_.vcf
Todo: using snpEff anntating the merged results
Merging
cat merged.snps.filtered_.vcf merged.snps_filtered_freebayes_.vcf > merged.snps.filtered.both.vcf
sort -t$' ' -k1,1 -g merged.snps.filtered.both.vcf > merged.snps.filtered.both_.vcf
common_snps.py
annotation merged.snps.filtered.both_.vcf
add genome to /home/jhuang/Tools/SPANDx_v3.1/snpEff/snpEff.config
/usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/snpEff/snpEff.jar eff -formatEff -v Klebsiella_pneumoniae_NTUH_K2044_uid59073 merged.snps.filtered_.vcf > merged.snps.filtered.both.annotated.vcf
/usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/snpEff/snpEff.jar eff -formatEff -v Enterococcus_faecalis_V583_uid57669 merged.snps.filtered.common.vcf > merged.snps.filtered.common.annotated.vcf
! /usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/snpEff/snpEff.jar eff -no-downstream -no-intergenic -ud 100 -formatEff -v GCA_000007645.1.22 merged.snps.filtered.both_.vcf > merged.snps.filtered.both.annotated.vcf
! /usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/snpEff/snpEff.jar eff -formatEff -v GCA_000007645.1.22 merged.snps.filtered.both_.vcf > merged.snps.filtered.both.annotated.vcf
In /home/jhuang/Fastqs/S_epidermidis_vc/trimmed_CROP300_REF/Phylo/annotated
Running: /usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/snpEff/snpEff.jar eff -no-downstream -no-intergenic -ud 100 -formatEff -v GCA_000007645.1.22 /home/jhuang/Fastqs/S_epidermidis_vc/trimmed_CROP300_REF/Phylo/out/out.vcf > /home/jhuang/Fastqs/S_epidermidis_vc/trimmed_CROP300_REF/Phylo/annotated/out.annotated.vcf
In /home/jhuang/Fastqs/S_epidermidis_vc/trimmed_CROP300_REF/Phylo/annotated
Running: /usr/bin/java -jar -XX:+UseSerialGC -Xmx4G /home/jhuang/Tools/SPANDx_v3.1/snpEff/snpEff.jar eff -no-downstream -no-intergenic -ud 100 -formatEff -v GCA_000007645.1.22 /home/jhuang/Fastqs/S_epidermidis_vc/trimmed_CROP300_REF/Phylo/out/out_indels.vcf > /home/jhuang/Fastqs/S_epidermidis_vc/trimmed_CROP300_REF/Phylo/annotated/out_indels.annotated.vcf
Todo: writing scripts to reformat the result of snpEff
rm ./Outputs/Comparative/All_SNPs_annotated.txt
mv ./Phylo/out/out.vcf ./Phylo/out/out_.vcf
cp ./Phylo/bams/merged.snps.filtered.vcf ./Phylo/out/out.vcf
check Reihenfolge of sample in out.vcf and out_.vcf
11-va35329 5-va8169-wt 6-va8169-scv
sample_11-va35329 sample_5-va8169-wt sample_6-va8169-scv
merged.indels_filtered_freebayes.vcf
./Phylo/out/out.vcf
./Phylo/indels/out/out.vcf
--
```