gene_x 0 like s 327 view s
Tags: variation, pipeline
mv snippy_HDRNA_01 snippy_CP133676
mv snippy_HDRNA_03 snippy_CP133677
cp -r snippy_HDRNA_06 snippy_CP133678
mv snippy_HDRNA_06 snippy_CP133679
cp -r snippy_HDRNA_07 snippy_CP133680
mv snippy_HDRNA_07 snippy_CP133681
cp -r snippy_HDRNA_08 snippy_CP133682
mv snippy_HDRNA_08 snippy_CP133683
cp -r snippy_HDRNA_12 snippy_CP133684
cp -r snippy_HDRNA_12 snippy_CP133685
cp -r snippy_HDRNA_12 snippy_CP133686
mv snippy_HDRNA_12 snippy_CP133687
cp -r snippy_HDRNA_16 snippy_CP133688
cp -r snippy_HDRNA_16 snippy_CP133689
cp -r snippy_HDRNA_16 snippy_CP133690
cp -r snippy_HDRNA_16 snippy_CP133691
mv snippy_HDRNA_16 snippy_CP133692
cp -r snippy_HDRNA_17 snippy_CP133693
cp -r snippy_HDRNA_17 snippy_CP133694
mv snippy_HDRNA_17 snippy_CP133695
cp -r snippy_HDRNA_19 snippy_CP133696
cp -r snippy_HDRNA_19 snippy_CP133697
cp -r snippy_HDRNA_19 snippy_CP133698
mv snippy_HDRNA_19 snippy_CP133699
cp -r snippy_HDRNA_20 snippy_CP133700
mv snippy_HDRNA_20 snippy_CP133701
#git clone https://github.com/huang/bacto
#mv bacto/* ./
#rm -rf bacto
for sample in snippy_CP133678 snippy_CP133679 snippy_CP133680 snippy_CP133681 snippy_CP133682 snippy_CP133683 snippy_CP133684 snippy_CP133685 snippy_CP133686 snippy_CP133687 snippy_CP133688 snippy_CP133689 snippy_CP133690 snippy_CP133691 snippy_CP133692 snippy_CP133693 snippy_CP133694 snippy_CP133695 snippy_CP133696 snippy_CP133697 snippy_CP133698 snippy_CP133699 snippy_CP133700 snippy_CP133701; do
cd ${sample};
ln -s ~/Tools/bacto/db/ .;
ln -s ~/Tools/bacto/envs/ .;
ln -s ~/Tools/bacto/local/ .;
cp ~/Tools/bacto/Snakefile .;
cp ~/Tools/bacto/bacto-0.1.json .;
cp ~/Tools/bacto/cluster.json .;
cd ..;
done
#prepare raw_data and bacto-0.1.json
#set "reference": "db/CP133***.gb" in bacto-0.1.json
conda activate bengal3_ac3
#cd snippy_CP133676
#/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
#cd snippy_CP133677
#/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd snippy_CP133678
(bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133679
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133680
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133681
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133682
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd snippy_CP133683
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133684
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133685
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133686
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133687
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133688
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133689
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133690
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133691
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133692
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133693
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133694
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133695
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133696
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133697
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133698
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133699
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133700
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ../snippy_CP133701
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
cd ..
2, using spandx calling variants (almost the same results to the one from viral-ngs!)
mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610
cp PP810610.1.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk
vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
/home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610 -d
gzip hCoV229E_Rluc_trimmed_P_1.fastq hCoV229E_Rluc_trimmed_P_2.fastq p10_DMSO_trimmed_P_1.fastq p10_DMSO_trimmed_P_2.fastq p10_K22_trimmed_P_1.fastq p10_K22_trimmed_P_2.fastq p10_K7523_trimmed_P_1.fastq p10_K7523_trimmed_P_2.fastq
ln -s /home/jhuang/Tools/spandx/ spandx
(spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref PP810610.fasta --annotation --database PP810610 -resume
3, summarize all SNPs and Indels from the snippy result directory.
#Output: snippy_CP133676/snippy/summary_snps_indels.csv
python3 summarize_snippy_res.py snippy_CP133676/snippy
#-- post-processing of Outputs_CP133676 produced by SPANDx (temporary not necessary) --
#cd Outputs_CP133676/Phylogeny_and_annotation
#awk '{if($3!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
#cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
#grep -v "/" f1_7 > f1_7_
#grep -v "\." f1_7_ > f1_7__
#grep -v "*" f1_7__ > f1_7___
#grep -v "INDEL" f1_7___ > f1_7_SNPs
#cut -d$'\t' -f2-2 f1_7_SNPs > f1_7_SNPs_f2
#grep -wFf <(awk 'NR>0 {print $1}' f1_7_SNPs_f2) All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
#grep -v "SNP" f1_7___ > f1_7_indels
#cut -d$'\t' -f2-2 f1_7_indels > f1_7_indels_f2
#grep -wFf <(awk 'NR>0 {print $1}' f1_7_indels_f2) All_SNPs_indels_annotated_.txt > All_indels_annotated.txt
4, merge the following two files summary_snps_indels.csv (192) and All_SNPs_indels_annotated.txt (248) --> merged_variants_CP133676.csv (94)
python3 merge_snps_indels.py snippy_CP133676/snippy/summary_snps_indels.csv Outputs_CP133676/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt merged_variants_CP133676.csv
#check if the number of the output file is correct?
comm -12 <(cut -d, -f2 snippy_CP133676/snippy/summary_snps_indels.csv | sort | uniq) <(cut -f2 Outputs_CP133676/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt | sort | uniq) | wc -l
comm -12 <(cut -d, -f2 snippy_CP133676/snippy/summary_snps_indels.csv | sort | uniq) <(cut -f2 Outputs_CP133676/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt | sort | uniq)
5, (optional, since it should not happed) filter rows without change in snippy_CP133676/snippy/summary_snps_indels.csv (94)
awk -F, '
NR == 1 {print; next} # Print the header line
{
ref = $3; # Reference base (assuming REF is in the 3rd column)
same = 1; # Flag to check if all bases are the same as reference
samples = ""; # Initialize variable to hold sample data
for (i = 6; i <= NF - 8; i++) { # Loop through all sample columns, adjusting for the number of fixed columns
samples = samples $i " "; # Collect sample data
if ($i != ref) {
same = 0;
}
}
if (!same) {
print $0; # Print the entire line if not all bases are the same as the reference
#print "Samples: " samples; # Print all sample data for checking
}
}
' merged_variants_CP133676.csv > merged_variants_CP133676_.csv
#Explanation:
# -F, sets the field separator to a comma.
# NR == 1 {print; next} prints the header line and skips to the next line.
# ref = $3 sets the reference base (assumed to be in the 3rd column).
# same = 1 initializes a flag to check if all sample bases are the same as the reference.
# samples = ""; initializes a string to collect sample data.
# for (i = 6; i <= NF - 8; i++) loops through the sample columns. This assumes the first 5 columns are fixed (CHROM, POS, REF, ALT, TYPE), and the last 8 columns are fixed (Effect, Impact, Functional_Class, Codon_change, Protein_and_nucleotide_change, Amino_Acid_Length, Gene_name, Biotype).
# samples = samples $i " "; collects sample data.
# if ($i != ref) { same = 0; } checks if any sample base is different from the reference base. If found, it sets same to 0.
# if (!same) { print $0; print "Samples: " samples; } prints the entire line and the sample data if not all sample bases are the same as the reference.
6, improve the header
sed -i '1s/_trimmed_P//g' merged_variants_CP133676.csv
7, check the REF and K1 have the same base and delete those records with difference.
cut -f3 -d',' merged_variants_CP133676.csv > f3
cut -f6 -d',' merged_variants_CP133676.csv > f6
diff f3 f6
awk -F, '$3 == $6 || NR==1' merged_variants_CP133676.csv > filtered_merged_variants_CP133676.csv #(93)
cut -f3 -d',' filtered_merged_variants_CP133676.csv > f3
cut -f6 -d',' filtered_merged_variants_CP133676.csv > f6
diff f3 f6
8, MANUALLY REMOVE the column f6 in filtered_merged_variants_CP133676.csv, and rename CHROM to HDRNA_01_K01 in the header, summarize chr and plasmids SNPs of a sample together to a single list, save as an Excel-file.
9, code of summarize_snippy_res.py
import pandas as pd
import glob
import argparse
import os
#python3 summarize_snps_indels.py snippy_HDRNA_01/snippy
#The following record for 2365295 is wrong, since I am sure in the HDRNA_01_K010, it should be a 'G', since in HDRNA_01_K010.csv
#CP133676,2365295,snp,A,G,G:178 A:0
#
#The current output is as follows:
#CP133676,2365295,A,snp,A,A,A,A,A,A,A,A,A,A,None,,,,,,None,None
#CP133676,2365295,A,snp,A,A,A,A,A,A,A,A,A,A,nan,,,,,,nan,nan
#grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv
#BUG: CP133676,2365295,A,snp,A,A,A,A,A,A,A,A,A,A,nan,,,,,,nan,nan
import pandas as pd
import glob
import argparse
import os
def main(base_directory):
# List of isolate identifiers
isolates = [f"HDRNA_01_K0{i}" for i in range(1, 11)]
expected_columns = ["CHROM", "POS", "REF", "ALT", "TYPE", "EFFECT", "LOCUS_TAG", "GENE", "PRODUCT"]
# Find all CSV files in the directory and its subdirectories
csv_files = glob.glob(os.path.join(base_directory, '**', '*.csv'), recursive=True)
# Create an empty DataFrame to store the summary
summary_df = pd.DataFrame()
# Iterate over each CSV file
for file_path in csv_files:
# Extract isolate identifier from the file name
isolate = os.path.basename(file_path).replace('.csv', '')
df = pd.read_csv(file_path)
# Ensure all expected columns are present, adding missing ones as empty columns
for col in expected_columns:
if col not in df.columns:
df[col] = None
# Extract relevant columns
df = df[expected_columns]
# Ensure consistent data types
df = df.astype({"CHROM": str, "POS": int, "REF": str, "ALT": str, "TYPE": str, "EFFECT": str, "LOCUS_TAG": str, "GENE": str, "PRODUCT": str})
# Add the isolate column with the ALT allele
df[isolate] = df["ALT"]
# Drop columns that are not needed for the summary
df = df.drop(["ALT"], axis=1)
if summary_df.empty:
summary_df = df
else:
summary_df = pd.merge(summary_df, df, on=["CHROM", "POS", "REF", "TYPE", "EFFECT", "LOCUS_TAG", "GENE", "PRODUCT"], how="outer")
# Fill missing values with the REF allele for each isolate column
for isolate in isolates:
if isolate in summary_df.columns:
summary_df[isolate] = summary_df[isolate].fillna(summary_df["REF"])
else:
summary_df[isolate] = summary_df["REF"]
# Rename columns to match the required format
summary_df = summary_df.rename(columns={
"CHROM": "CHROM",
"POS": "POS",
"REF": "REF",
"TYPE": "TYPE",
"EFFECT": "Effect",
"LOCUS_TAG": "Gene_name",
"GENE": "Biotype",
"PRODUCT": "Product"
})
# Replace any remaining None or NaN values in the non-isolate columns with empty strings
summary_df = summary_df.fillna("")
# Add empty columns for Impact, Functional_Class, Codon_change, Protein_and_nucleotide_change, Amino_Acid_Length
summary_df["Impact"] = ""
summary_df["Functional_Class"] = ""
summary_df["Codon_change"] = ""
summary_df["Protein_and_nucleotide_change"] = ""
summary_df["Amino_Acid_Length"] = ""
# Reorder columns
cols = ["CHROM", "POS", "REF", "TYPE"] + isolates + ["Effect", "Impact", "Functional_Class", "Codon_change", "Protein_and_nucleotide_change", "Amino_Acid_Length", "Gene_name", "Biotype"]
summary_df = summary_df.reindex(columns=cols)
# Remove duplicate rows
summary_df = summary_df.drop_duplicates()
# Save the summary DataFrame to a CSV file
output_file = os.path.join(base_directory, "summary_snps_indels.csv")
summary_df.to_csv(output_file, index=False)
print("Summary CSV file created successfully at:", output_file)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Summarize SNPs and Indels from CSV files.")
parser.add_argument("directory", type=str, help="Base directory containing the CSV files in subdirectories.")
args = parser.parse_args()
main(args.directory)
10, code of merge_snps_indels.py
import pandas as pd
import argparse
import os
def merge_files(input_file1, input_file2, output_file):
# Read the input files
df1 = pd.read_csv(input_file1)
df2 = pd.read_csv(input_file2, sep='\t')
# Merge the dataframes on the 'POS' column, keeping only the rows that have common 'POS' values
merged_df = pd.merge(df2, df1[['POS']], on='POS', how='inner')
# Remove duplicate rows
merged_df.drop_duplicates(inplace=True)
# Save the merged dataframe to the output file
merged_df.to_csv(output_file, index=False)
print("Merged file created successfully at:", output_file)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Merge two SNP and Indel files based on the 'POS' column.")
parser.add_argument("input_file1", type=str, help="Path to the first input file (summary_snps_indels.csv).")
parser.add_argument("input_file2", type=str, help="Path to the second input file (All_SNPs_indels_annotated.txt).")
parser.add_argument("output_file", type=str, help="Path to the output file.")
args = parser.parse_args()
merge_files(args.input_file1, args.input_file2, args.output_file)
点赞本文的读者
还没有人对此文章表态
没有评论
Scaffolding and finishing an assembly with a reference genome
Variant Calling for Herpes Simplex Virus 1 from Patient Sample Using Capture Probe Sequencing
© 2023 XGenes.com Impressum