gene_x 0 like s 307 view s
Tags: processing, pipeline
run tpp https://transit.readthedocs.io/en/latest/transit_running.html
#-maxreads 10000 or not_given for take all!
#-primer AGATGTGTATAAGAGACAG the default primer of Tn5 is TAAGAGACAG!
#-primer-start-window 0,159 set 0,159 as default!
python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R1_001.fastq -reads2 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R2_001.fastq -output 10_chimera -mismatches initial_mutants_rep1 -bwa-alg mem -primer ACCTACCCCNCCGCTCTC -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
mv tpp.cfg initial_mutants_rep1.tpp.cfg
python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R1_001.fastq -reads2 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R2_001.fastq -output 10 -mismatches initial_mutants_rep1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
mv tpp.cfg initial_mutants_rep1.tpp.cfg
python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr3/LB_culture_rep1_S26_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr3/LB_culture_rep1_S26_R2_001.fastq.gz -output LB_culture_rep1 -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
mv tpp.cfg LB_culture_rep1.tpp.cfg
python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr9/intracellular_mutants_24h_rep1_S29_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr9/intracellular_mutants_24h_rep1_S29_R2_001.fastq.gz -output intracellular_mutants_24h_rep1 -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
mv tpp.cfg intracellular_mutants_24h_rep1.tpp.cfg
python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr8/extracellular_mutants_24h_rep2_S28_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr8/extracellular_mutants_24h_rep2_S28_R2_001.fastq.gz -output extracellular_mutants_24h_rep2 -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
mv tpp.cfg extracellular_mutants_24h_rep2.tpp.cfg
python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr6/growthout_control_24h_rep2_S27_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr6/growthout_control_24h_rep2_S27_R2_001.fastq.gz -output growthout_control_24h_rep2 -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
mv tpp.cfg growthout_control_24h_rep2.tpp.cfg
#END
cp initial_mutants_rep1.tn_stats initial_mutants_rep1.tn_stats_
#Delete all general statistics before the table data in initial_mutants_rep1.tn_stats_; delete the content after "# FR_corr (Fwd templates vs. Rev templates):"
sed -i 's/read_count (TA sites only, for Himar1)/read_counts/g' *.tn_stats_
sed -i 's/NZ_mean (among templates)/NZ_mean (mean template count over non-zero TA sites)/g' *.tn_stats_
python3 ../parse_tn_stats.py initial_mutants_rep1.tn_stats_ initial_mutants_rep1.tn_stats.xlsx
python3 ../parse_tn_stats.py LB_culture_rep1.tn_stats_ LB_culture_rep1.tn_stats.xlsx
#calculate the sum of the first and second columns by "=SUM(B2:B130)" and "=SUM(C2:C130)" 441057 and 276060
mkdir initial_mutants_rep1_wig
mv *.wig initial_mutants_rep1_wig/
zip -r initial_mutants_rep1_wig.zip initial_mutants_rep1_wig/
#The counts-files are too big, not nessasary to send:
#~/Tools/csv2xls-0.4/csv_to_xls.py initial_mutants_rep1.tn_stats_ *.counts -d$',' -o initial_mutants_rep1.stats.xls;
# contig_1_10 699 411
# contig_1_8 3206 2031
# contig_1_7 3787 2376
# contig_1_6 4604 2871
# contig_1_5 2794 1765
# contig_1_4 83 58
# contig_1_3 2944 1882
# contig_1_2 15446 9678
# contig_1_1 14391 8954
delete PCR-duplicate for checking if the template counts in tpp are correct
To address the issue of PCR duplicates in paired-end sequencing data (which you referred to as "PCA-duplicate," but I believe you meant "PCR duplicate"), you can use tools designed for post-alignment processing of SAM/BAM files. These tools identify and remove or mark duplicates where both reads of a pair are identical to another pair in the library, often indicating that they are duplicates resulting from PCR amplification rather than unique sequencing events.
Step 0. Install Samtools and Picard Tools for Removing PCR Duplicates: Both Samtools and Picard are widely used for manipulating SAM/BAM files, including removing duplicates. Here's how you can use Picard to remove PCR duplicates:
conda install -c bioconda picard samtools
Step 1: Convert SAM to BAM and sort the BAM file
If your file is in SAM format, you need to convert it to BAM format first using Samtools; PCR duplicate removal requires that the BAM file be sorted by coordinate. This can be done using Samtools:
samtools view -Sb initial_mutants_rep1.sam > initial_mutants_rep1.bam
samtools sort initial_mutants_rep1.bam -o initial_mutants_rep1_sorted.bam
#557566 + 0 read1
#557566 + 0 read2
samtools index initial_mutants_rep1_sorted.bam
# contig_1_1 14391 8954
#@SQ SN:gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1 LN:84292
# Extract reads from contig_1_1
samtools view -b initial_mutants_rep1_sorted.bam "gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1" > contig_1_1.bam
# Run flagstat on the filtered BAM file
samtools flagstat contig_1_1.bam
#16589 + 0 read1
#16579 + 0 read2
Step 3: Mark or Remove Duplicates Using Picard
Picard Tools can be used to mark duplicates. Here, I'll show you how to remove them:
# -Xmx4g between java and -jar
# java -jar /usr/local/bin/picard.jar
picard MarkDuplicates \
I=contig_1_1.bam \
O=contig_1_1_no_duplicates.bam \
M=marked_dup_metrics.txt \
REMOVE_DUPLICATES=true
# ## METRICS CLASS picard.sam.DuplicationMetrics
#LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE
#Unknown Library 111 16473 81 111 90 6114 0 0.372629 16270
samtools flagstat contig_1_1_no_duplicates.bam
#10405 + 0 read1
#10445 + 0 read2
This command marks duplicates (PCR and optical) and removes them, outputting a file without duplicates. The M option specifies where to write metrics about the duplicates.
Step 3 (Alternatively). Using samtools to Remove Duplicates Directly
Alternatively, if you prefer a simple and fast tool, samtools has a rmdup utility, but it's less versatile compared to Picard:
samtools rmdup -S input_sorted.bam output_no_duplicates.bam
Note that samtools rmdup is deprecated in recent versions of samtools because it does not handle all edge cases as well as Picard.
Considerations
Read Alignment: Ensure that your reads are aligned correctly and that the SAM/BAM files are error-free before duplicate removal.
Optical vs. PCR Duplicates: Picard differentiates between PCR duplicates (originating from the same fragment of DNA) and optical duplicates (artifacts from the sequencing platform). Ensure that this distinction is clear if relevant for your analysis.
Quality Checks: After removing duplicates, it's wise to perform quality control checks to understand how much of your data was affected by duplicates.
Check how the failed trimmed reads not contain the the transposon:genomic boundary TAAGAGACAG
#-3.1- find read sequences
>VH00358:89:AAFC5MTM5:1:1101:63544:1019_:N:0:TTTCTCTA+CTCGACG
ACCTACCCCNCCGCTCTCATCAACCCAATAACGCAGGCAATCAAGCACCCACTGCATCACATAAGGTTGGCTAAGGCGCAATGTATTGCCACAACCGGTCATGTTGTCATATTCACCCTCAGAGGTGAGCCAGTAGTAGCTGGCATTGTCGATCCCACG
>VH00358:89:AAFC5MTM5:1:1101:63960:1019_:N:0:TTTCTCTA+CTCGACG
CAAAGAGGGGGGCGAAAAGATTTTAAACGATCTTGGCGAAATGAATTTTGAGTTTGTCGTGTGTGACTCACCTGCCGGTATCGAAAGCGGTGCGTTGATGGCACTGTATTTTGCTGACG
>VH00358:89:AAFC5MTM5:1:1101:64150:1019_:N:0:TTTCTCTA+CTCGACG
TCTTTACCGGGGATGGGACGCAAGATCTGCGCGCACTGGAACCGGCTTATGTTTCCTCCGTTGACAGTGGGAATCTGGCTGGGCATTTGATTGTACTGGCCAATACCTGTGAAGAGTGGGCCGCAGAACCCTTAGCGGCCAACGGGGCCAAGGGATTG
>VH00358:89:AAFC5MTM5:1:1101:65400:1019_:N:0:TTTCTCTA+CTCGACG
CACCTACCCCCCAGCTCTCATCAACCACAATTGACGCAACATCAGCTGGCGCCATGGCATTAATAACAAACTGGATGAATGGCCCCTGGGATCAACAGCACGAACAAACCGCCGTGTGGTGCCATCAGTTGCGCGCCGAAGGCCATTGACAGAGCACC
>VH00358:89:AAFC5MTM5:1:1101:62048:1038_:N:0:TTTCTCTA+CTCGACG
CCTACACCCCCGCTCTCATCAACCGCAGCAAAGAAAACGAAAATAATGAAGTCATAAAACTCAAGCGCTCCGCCTAAAGCCGCCAGAGTGAGGGTTTTATAATCTTGCTTATTCAGCCGACGGTTATGAT
>VH00358:89:AAFC5MTM5:1:1101:64623:1038_:N:0:TTTCTCTA+CTCGACG
CCTACACCCCAGCTCTCATCAACCAATAACCAGTCAACATCACTGACATCATGTTGCTGGCAATATTCCAGCAGTGCATCTGGCGCCCAAAAACGCGCGCGATCCCCTGCTCGCTGACTTAATACTTGAGTGGCGGCCTTGCCCGGCCCAGTAACCCA
>VH00358:89:AAFC5MTM5:1:1101:65343:1038_:N:0:TTTCTCTA+CTCGACG
ACCTACCACCAAGCTCTCATCAACCTGAATGGATTGAGGGCTACGCTCCCAGATCTGTTCTGCCAATTGGCGCAGATGGTTATCGGTGCAATACAAATGCACAAATGACAAATCGTCTCTTACATCTCAGCTCGATAAACTGCCCTTTGGCATAATGC
>VH00358:89:AAFC5MTM5:1:1101:63317:1057_:N:0:GTTCTCTA+CTCGACG
GTTGGCGTCCGGCATCTGCATATCACTCATAAAGCGCCTCGATAATCCCACGGATATCCTGGTCGCTAGCTGGGCGCGGGTTGGTGCGCAATGTGAGCTCGGCCAGGGCTGCAGCAATCATATCGGGCAGATGTTGCTGCAATTGGCGCTCATTAACTT
>VH00358:89:AAFC5MTM5:1:1101:63430:1057_:N:0:TTTCTCTA+CTCGACG
GCACTGGAAGAGCCGACTAGCCTCAATACTCTTGAACTGCTACCGGAATTATTTGCCGCCAATATTGCCTCGGTGAAAATTGAAGGGCGTCAGCGCAGCCCGGCTTATGTCAGCCAGGTGGCGAAAGTGTGGCGGCAGGCAATTGACCGCTATCTGGC
>VH00358:89:AAFC5MTM5:1:1101:58299:1076_:N:0:TTTCTCTA+CTCGACG
TAAATAGGCCAGACTTGAAATCACACGATCCGGCCAGCGATT
#-3.2- generate the genomic sequences
blastn -db ../WA-314_m.fna -query initial_mutants_rep1.trimmed1_failed_trim_n20 -out failed_trim_n20_on_yersinia.blastn -evalue 10000 -num_threads 124 -outfmt 6 -max_target_seqs 1
VH00358:89:AAFC5MTM5:1:1101:63544:1019_:N:0:TTTCTCTA+CTCGACG gi|420260421|ref|NZ_AKKR01000094.1|contig_3_51 100.000 141 0 0 19 159 20552 20692 1.20e-70 261
VH00358:89:AAFC5MTM5:1:1101:63960:1019_:N:0:TTTCTCTA+CTCGACG gi|420258256|ref|NZ_AKKR01000038.1|contig_1_38 98.319 119 2 0 1 119 43674 43556 3.14e-55 209
VH00358:89:AAFC5MTM5:1:1101:64150:1019_:N:0:TTTCTCTA+CTCGACG gi|420257402|ref|NZ_AKKR01000018.1|contig_1_12 98.101 158 3 0 1 158 59515 59672 4.26e-75 276
VH00358:89:AAFC5MTM5:1:1101:65400:1019_:N:0:TTTCTCTA+CTCGACG gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2 100.000 57 0 0 18 74 78361 78417 5.91e-24 106
VH00358:89:AAFC5MTM5:1:1101:62048:1038_:N:0:TTTCTCTA+CTCGACG gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1 100.000 110 0 0 21 130 78749 78640 1.62e-53 204
VH00358:89:AAFC5MTM5:1:1101:64623:1038_:N:0:TTTCTCTA+CTCGACG gi|420259377|ref|NZ_AKKR01000067.1|contig_3_23 99.329 149 0 1 10 158 54935 55082 7.13e-73 268
VH00358:89:AAFC5MTM5:1:1101:65343:1038_:N:0:TTTCTCTA+CTCGACG gi|420258810|ref|NZ_AKKR01000051.1|contig_3_3 99.301 143 1 0 16 158 46439 46581 4.29e-70 259
VH00358:89:AAFC5MTM5:1:1101:63317:1057_:N:0:GTTCTCTA+CTCGACG gi|420257889|ref|NZ_AKKR01000029.1|contig_1_25 96.226 159 6 0 1 159 36032 36190 1.20e-70 261
VH00358:89:AAFC5MTM5:1:1101:63430:1057_:N:0:TTTCTCTA+CTCGACG gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2 100.000 158 0 0 1 158 54938 55095 4.23e-80 292
VH00358:89:AAFC5MTM5:1:1101:58299:1076_:N:0:TTTCTCTA+CTCGACG gi|420259608|ref|NZ_AKKR01000075.1|contig_3_31 97.619 42 1 0 1 42 25535 25576 1.04e-14 73.1
samtools faidx WA-314_m.fna "gi|420260421|ref|NZ_AKKR01000094.1|contig_3_51":20534-20692 > genome_rg_for_read1.fasta
samtools faidx WA-314_m.fna "gi|420258256|ref|NZ_AKKR01000038.1|contig_1_38":43556-43674 > genome_rg_for_read2.fasta
samtools faidx WA-314_m.fna "gi|420257402|ref|NZ_AKKR01000018.1|contig_1_12":59515-59672 > genome_rg_for_read3.fasta
samtools faidx WA-314_m.fna "gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2":78344-78501 > genome_rg_for_read4.fasta
samtools faidx WA-314_m.fna "gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1":78640-78769 > genome_rg_for_read5.fasta
samtools faidx WA-314_m.fna "gi|420259377|ref|NZ_AKKR01000067.1|contig_3_23":54926-55082 > genome_rg_for_read6.fasta
samtools faidx WA-314_m.fna "gi|420258810|ref|NZ_AKKR01000051.1|contig_3_3":46424-46581 > genome_rg_for_read7.fasta
samtools faidx WA-314_m.fna "gi|420257889|ref|NZ_AKKR01000029.1|contig_1_25":36032-36190 > genome_rg_for_read8.fasta
samtools faidx WA-314_m.fna "gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2":54938-55095 > genome_rg_for_read9.fasta
samtools faidx WA-314_m.fna "gi|420259608|ref|NZ_AKKR01000075.1|contig_3_31":25535-25576 > genome_rg_for_read10.fasta
#-3.3- transposon amplicon start sequence
>Transposon_ampli_start
ACCTACAACAAAGCTCTCATCAAC CGTGGCGGGGATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTCAGGGTTGAGATGTGTA TAAGAGACAG
#-3.4- generate 10 failed_${read}_group.fasta and multiple align them
for read in read1 read2 read3 read4 read5 read6 read7 read8 read9 read10; do
cat failed_${read}.fasta genome_rg_for_${read}.fasta transposon_ampli_start.fasta > failed_${read}_group.fasta
mafft --adjustdirection --clustalout failed_${read}_group.fasta > failed_${read}_group.aln
done
cat failed_read1_group.aln failed_read2_group.aln failed_read3_group.aln failed_read4_group.aln failed_read5_group.aln failed_read6_group.aln failed_read7_group.aln failed_read8_group.aln failed_read9_group.aln failed_read10_group.aln > failed_reads_group.aln
#-3.5- show the results in checking_failed_reads.pdf
Source code of parse_tn_stats.py used in the point 1.
import argparse
import pandas as pd
def parse_tn_stats_detailed(file_path):
with open(file_path, 'r') as file:
lines = file.readlines()
print("Number of lines read:", len(lines))
if lines:
print("First few lines:", lines[:10])
metrics_df = pd.DataFrame()
current_metric = None
metric_data = {}
for line in lines:
if line.startswith('# ') and not line.startswith('# '): # Ensure it's not a contig data line # Metric definition
if current_metric and metric_data:
# Save data before starting new metric
for contig, value in metric_data.items():
metrics_df.at[contig, current_metric] = value
metric_data = {}
# Extract the metric name
current_metric = line.split(':')[0].strip('# ')
print("Processing new metric:", current_metric) # Debug output
else: # Data lines for contigs under the current metric
parts = line.strip().split(':')
contig = parts[0].strip()
value = parts[1].strip()
try:
value = float(value) if value.lower() != 'nan' else pd.NA
except ValueError:
value = pd.NA # In case of conversion failure
metric_data[contig] = value
# Capture the last metric data after finishing all lines
if current_metric and metric_data:
for contig, value in metric_data.items():
metrics_df.at[contig, current_metric] = value
metrics_df.fillna(value=pd.NA, inplace=True)
return metrics_df
def main():
parser = argparse.ArgumentParser(description="Parse detailed TN stats from a file and output to Excel.")
parser.add_argument("input_file", help="Input file path for TN stats.")
parser.add_argument("output_file", help="Output Excel file path.")
args = parser.parse_args()
# Parse the detailed metrics from the provided file path
detailed_metrics_df = parse_tn_stats_detailed(args.input_file)
# Save the DataFrame to an Excel file
with pd.ExcelWriter(args.output_file, engine='openpyxl') as writer:
detailed_metrics_df.to_excel(writer, sheet_name='Detailed Metrics')
print(f"Final DataFrame saved to Excel: {args.output_file}")
if __name__ == "__main__":
main()
点赞本文的读者
还没有人对此文章表态
没有评论
Variant Calling for Herpes Simplex Virus 1 from Patient Sample Using Capture Probe Sequencing
Typing of 81 S. epidermidis samples (Luise)
Co-Authorship Network Generator using scraped data from Google Scholar via SerpAPI
© 2023 XGenes.com Impressum