GO terms for S. epidermidis

Tags: python, pipeline

  1. download GO terms from the Gene Ontology (https://geneontology.org/docs/download-ontology/)

    import csv
    import obonet
    
    # Load the OBO file
    url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
    graph = obonet.read_obo(url)
    
    # Write one row per GO term; csv.writer quotes term names that contain commas
    with open('go_terms.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "name", "namespace"])  # Write header
        for node_id, data in graph.nodes(data=True):
            name = data.get('name', '')
            namespace = data.get('namespace', '')
            writer.writerow([node_id, name, namespace])  # Write data
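
A minimal sanity check of the dump (a sketch; it assumes go_terms.csv was just written by the block above and uses GO:0008150, "biological_process", only as a familiar example ID):

    import obonet
    import pandas as pd
    
    graph = obonet.read_obo('http://purl.obolibrary.org/obo/go/go-basic.obo')
    # node attributes hold each term's metadata
    print(graph.nodes['GO:0008150']['name'])   # biological_process
    
    # the CSV written above has one row per node in the graph
    go_terms = pd.read_csv('go_terms.csv')
    assert len(go_terms) == graph.number_of_nodes()
    print(go_terms['namespace'].value_counts())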
    
  2. prepare the blastp output

    # extract S. epidermidis (TaxID=1282) records; repeat for uniref90.fasta and uniref50.fasta if those databases are needed
    python3 filter_uniref_by_taxid.py uniref100.fasta uniref100_taxid1282.fasta
    
    makeblastdb -in uniref90_taxid1282.fasta -dbtype prot
    makeblastdb -in uniref50_taxid1282.fasta -dbtype prot
    makeblastdb -in uniref100_taxid1282.fasta -dbtype prot
    
    # -strand both -max_target_seqs 1 -taxidlist taxid_list.txt
    blastp -query 1585_CDS.fasta -db /home/jhuang/REFs/uniref100_taxid1282.fasta -out 1585_CDS_on_uniref100.blastp -num_threads 100 -outfmt 6 -evalue 1.0e-30
    # drop hits whose UniRef100 representative is a UniParc record (UniRef100_UPI...), i.e. has no UniProt accession
    grep -v "UniRef100_UPI" 1585_CDS_on_uniref100.blastp > 1585_CDS_on_uniref100_.blastp
    
    python3 add_RefSeq-ID.py 1585_genes_annotated_with_GO.csv 1585.csv 1585_genes_annotated_with_GO_RefSeq-ID.csv
    python3 add_UniProt-ID.py  # inputs (hard-coded in the script): 1585_CDS_on_uniref100_.blastp and 1585_genes_annotated_with_GO_RefSeq-ID.csv; output: 1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID.csv
    python3 add_Translation.py 1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID.csv 1585.csv 1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID_Translation.csv
    
    # sanity check: the locus-tag column (column 1) must be identical before and after the merge steps
    cut -f1 -d',' 1585_genes_annotated_with_GO_RefSeq-ID.csv > temp1
    cut -f1 -d',' 1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID_Translation.csv > temp2
    diff temp1 temp2
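
The blastp call above keeps every significant hit per CDS. If only the single best UniRef100 match per query is wanted before running add_UniProt-ID.py, one option is to collapse the tabular output first (a sketch, not part of the original pipeline; the standard -outfmt 6 column order is assumed and the output file name is made up):

    import pandas as pd
    
    cols = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
            "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
    hits = pd.read_csv("1585_CDS_on_uniref100_.blastp", sep="\t", names=cols)
    
    # keep only the highest-scoring hit for each CDS
    best = (hits.sort_values("bitscore", ascending=False)
                .drop_duplicates("qseqid", keep="first"))
    best.to_csv("1585_CDS_on_uniref100_best.blastp", sep="\t", header=False, index=False)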
    
  3. code of filter_uniref_by_taxid.py

    import sys
    
    def filter_uniref50(input_file, output_file):
        # Keep only FASTA records whose header contains 'TaxID=1282 '
        with open(input_file, 'r') as in_handle, open(output_file, 'w') as out_handle:
            record_started = False
            for line in in_handle:
                if line.startswith(">"):
                    if "TaxID=1282 " in line:
                        record_started = True
                        out_handle.write(line)
                    else:
                        record_started = False
                elif record_started:
                    out_handle.write(line)
    
    if __name__ == "__main__":
        # Check command-line arguments
        if len(sys.argv) != 3:
            print("Usage: python3 filter_uniref_by_taxid.py <input_file> <output_file>")
            sys.exit(1)
    
        # Get input and output file paths from command-line arguments
        input_file = sys.argv[1]
        output_file = sys.argv[2]
    
        # Filter the UniRef database (UniRef50/90/100), keeping only records with TaxID=1282
        filter_uniref50(input_file, output_file)
    
        print("Filtered UniRef database saved to:", output_file)
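
A tiny self-contained demo of the filter (a sketch; the two records are made up but follow the usual UniRef header layout, and the trailing space in "TaxID=1282 " keeps unrelated taxa such as TaxID=12820 from matching):

    from filter_uniref_by_taxid import filter_uniref50
    
    records = [
        ">UniRef100_AAA11111 Demo protein n=1 Tax=Staphylococcus epidermidis TaxID=1282 RepID=AAA11111_STAEP",
        "MKKLLVL",
        ">UniRef100_BBB22222 Other protein n=1 Tax=Homo sapiens TaxID=9606 RepID=BBB22222_HUMAN",
        "MEEPQSD",
    ]
    with open("demo.fasta", "w") as fh:
        fh.write("\n".join(records) + "\n")
    
    filter_uniref50("demo.fasta", "demo_taxid1282.fasta")
    print(open("demo_taxid1282.fasta").read())   # only the TaxID=1282 record remains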
    
  4. code of add_RefSeq-ID.py

    import pandas as pd
    import sys
    
    def main(input_file1, input_file2, output_file):
        # Load the first table (GO annotations)
        table1 = pd.read_csv(input_file1)
    
        # Load the second table (gene table with 'Locus Tag' and 'inference' columns)
        table2 = pd.read_csv(input_file2)
    
        # Merge the tables on SeqName from table1 and Locus Tag from table2
        merged_table = pd.merge(table1, table2[['Locus Tag', 'inference']], left_on='SeqName', right_on='Locus Tag', how='left')
    
        # Drop the extra 'Locus Tag' column that is redundant after the merge
        merged_table.drop(columns=['Locus Tag'], inplace=True)
    
        # Save the merged table to a new CSV file
        merged_table.to_csv(output_file, index=False)
    
    if __name__ == "__main__":
        if len(sys.argv) != 4:
            print("Usage: python3 add_RefSeq-ID.py <input_file1> <input_file2> <output_file>")
            # e.g. python3 add_RefSeq-ID.py 1585_genes_annotated_with_GO.csv 1585.csv 1585_genes_annotated_with_GO_RefSeq-ID.csv
        else:
            input_file1 = sys.argv[1]
            input_file2 = sys.argv[2]
            output_file = sys.argv[3]
            main(input_file1, input_file2, output_file)
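
A toy run of the same merge logic (a sketch with made-up locus tags and IDs), showing that genes without a match in the second table simply get an empty inference field:

    import pandas as pd
    
    table1 = pd.DataFrame({'SeqName': ['LT_0001', 'LT_0002'],
                           'GO': ['GO:0008150', 'GO:0003674']})
    table2 = pd.DataFrame({'Locus Tag': ['LT_0001'], 'inference': ['WP_000000001.1']})
    
    merged = pd.merge(table1, table2[['Locus Tag', 'inference']],
                      left_on='SeqName', right_on='Locus Tag', how='left')
    merged.drop(columns=['Locus Tag'], inplace=True)
    print(merged)
    # roughly:
    #    SeqName          GO       inference
    # 0  LT_0001  GO:0008150  WP_000000001.1
    # 1  LT_0002  GO:0003674             NaN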
    
  5. code of add_UniProt-ID.py

    import csv
    
    # Read blastp results and create a dictionary mapping CDS IDs to lists of UniRef100 IDs
    def read_blastp_results(blastp_file):
        cds_to_uniref100 = {}
        with open(blastp_file, 'r') as blastp_handle:
            blastp_reader = csv.reader(blastp_handle, delimiter='\t')
            for row in blastp_reader:
                cds_id, uniref100_id = row[0], row[1]
                if cds_id not in cds_to_uniref100:
                    cds_to_uniref100[cds_id] = []
                cds_to_uniref100[cds_id].append(uniref100_id)
        return cds_to_uniref100
    
    # Add UniRef100 IDs as the last column in the input CSV file
    def add_uniref100_column(input_file, output_file, cds_to_uniref100):
        with open(input_file, 'r') as input_handle:
            with open(output_file, 'w', newline='') as output_handle:
                reader = csv.reader(input_handle)
                writer = csv.writer(output_handle)
                for row in reader:
                    cds_id = row[0]
                    if cds_id in cds_to_uniref100:
                        uniref100_ids = '|'.join(cds_to_uniref100[cds_id])
                        row.append(uniref100_ids)
                    else:
                        row.append("")  # If no UniRef100 IDs found, add an empty string
                    writer.writerow(row)
    
    if __name__ == "__main__":
        blastp_file = "1585_CDS_on_uniref100_.blastp"  # Replace with your blastp results file
        input_file = "1585_genes_annotated_with_GO_RefSeq-ID.csv"
        output_file = "1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID.csv"
    
        # Read blastp results and create a dictionary mapping CDS IDs to lists of UniRef100 IDs
        cds_to_uniref100 = read_blastp_results(blastp_file)
    
        # Add UniRef100 IDs as the last column in the input CSV file
        add_uniref100_column(input_file, output_file, cds_to_uniref100)
    
        print("UniRef100 IDs added to the input CSV file:", output_file)
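
Because UniParc-only hits (UniRef100_UPI...) were removed with grep in step 2, every ID in the appended column is just "UniRef100_" followed by a UniProt accession, so bare UniProt IDs can be recovered by stripping that prefix (a sketch, not part of the original scripts; the "UniProt-ID" column name is made up, and the appended column is addressed positionally because the script above leaves it unnamed):

    import pandas as pd
    
    df = pd.read_csv("1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID.csv")
    uniref_col = df.columns[-1]   # the column appended by add_UniProt-ID.py
    df["UniProt-ID"] = df[uniref_col].fillna("").str.replace("UniRef100_", "", regex=False)
    print(df[["SeqName", uniref_col, "UniProt-ID"]].head())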
    
  6. code of add_Translation.py

    import pandas as pd
    import sys
    
    def main(input_file1, input_file2, output_file):
        # Load the first table (GO annotations with RefSeq and UniProt IDs)
        table1 = pd.read_csv(input_file1)
    
        # Load the second table (gene table with 'Locus Tag' and 'Translation' columns)
        table2 = pd.read_csv(input_file2)
    
        # Merge the tables on SeqName from table1 and Locus Tag from table2
        merged_table = pd.merge(table1, table2[['Locus Tag', 'Translation']], left_on='SeqName', right_on='Locus Tag', how='left')
    
        # Drop the extra 'Locus Tag' column that is redundant after the merge
        merged_table.drop(columns=['Locus Tag'], inplace=True)
    
        # Save the merged table to a new CSV file
        merged_table.to_csv(output_file, index=False)
    
    if __name__ == "__main__":
        if len(sys.argv) != 4:
            print("Usage: python3 add_Translation.py <input_file1> <input_file2> <output_file>")
            # e.g. python3 add_Translation.py 1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID.csv 1585.csv 1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID_Translation.csv
        else:
            input_file1 = sys.argv[1]
            input_file2 = sys.argv[2]
            output_file = sys.argv[3]
            main(input_file1, input_file2, output_file)
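
With all three columns added, a final end-of-pipeline check can mirror the cut/diff comparison from step 2 and report how many genes are still missing an annotation (a sketch; the column names come from the merge code above):

    import pandas as pd
    
    start = pd.read_csv("1585_genes_annotated_with_GO_RefSeq-ID.csv")
    final = pd.read_csv("1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID_Translation.csv")
    
    # the gene column must be unchanged by the later merge/append steps
    assert start["SeqName"].tolist() == final["SeqName"].tolist()
    
    # count genes still lacking a RefSeq-derived inference or a translation
    print(final[["inference", "Translation"]].isna().sum())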
    
