gene_x 0 like s 587 view s
Tags: genes, processing
-- 0, generate the ref.gtf and ref.bed from GenBank file: HD21-2_new.gtf and HD21-2_new.bed --
#[PREPARE gtf from gff] https://github.com/gpertea/gffread/issues/3
#DEL gb2gff.py HD21-2_chr.gbk > HD21-2.gtf
#DEL gb2gff.py HD21-2_plasmid_pHD21-2_1.gbk >> HD21-2.gtf
#DEL gb2gff.py HD21-2_plasmid_pHD21-2_2.gbk >> HD21-2.gtf
#-- STEP 0.0: download the gff3 files --
cat HD21-2_chr.gff3 HD21-2_plasmid1.gff3 HD21-2_plasmid2.gff3 > HD21-2.gff3
#-- STEP 0.1: rename CDS to exon! --
gffread -E -F -O -T HD21-2.gff3 -o HD21-2.gtf
sed -i -e "s/\tCDS\t/\texon\t/g" HD21-2.gtf
#Genbank_CDS --> Genbank_exon
#cmsearch_CDS --> cmsearch_exon
#tRNAscan-SE_CDS --> tRNAscan-SE_exon
#or xxx_CDS --> xxx_exon --> xxx[cmsearch, Genbank, tRNAscan] exon
#
#-- STEP 0.2: add protein_coding "xxxx" to type exon
#exon (former Genbank CDS): 2657 = (Genbank transcript + cmsearch transcript + tRNAscan-SE transcript) = (2586+61+10)
#Genbank region: 7
# transcript: 2649
#Genbank_exon == Genbank transcript 2586
#tRNAscan-SE_exon == tRNAscan-SE transcript 61
#cmsearch_exon == cmsearch transcript 10
------------------------------------------------------
#total 5313 (transcript 2649-->bed, exon 2657, region 7)
#in kate "\texon" --> "_exon"
grep "Genbank_exon" HD21-2.gtf > HD21-2_Genbank_exon.gtf #add gene_biotype "protein_coding"; at the end of line (2586)
grep "tRNAscan-SE_exon" HD21-2.gtf > HD21-2_tRNAscan-SE_exon.gtf #add gene_biotype "tRNA"; at the end of line (61)
grep "cmsearch_exon" HD21-2.gtf > HD21-2_cmsearch_exon.gtf #add gene_biotype "RNase_P_RNA";| gene_biotype "ncRNA";| gene_biotype "rRNA";| gene_biotype "tmRNA";| gene_biotype "SRP_RNA"; at the end (10)
grep -i -e "cmsearch" HD21-2.gtf > temp
grep -i -e "gene_biotype" temp > temp2
CP052994.1 cmsearch transcript 1286239 1286622 . - . transcript_id "rna-HKH70_06005"; gene_id "gene-HKH70_06005"; gene_name "rnpB"; Dbxref "RFAM:RF00011"; gbkey "ncRNA"; gene "rnpB"; inference "COORDINATES: profile:INFERNAL:1.1.1"; locus_tag "HKH70_06005"; product "RNase P RNA component class B"; Name "rnpB"; gene_biotype "RNase_P_RNA";
CP052994.1 cmsearch transcript 1452715 1452905 . - . transcript_id "rna-HKH70_06865"; gene_id "gene-HKH70_06865"; gene_name "ssrS"; Dbxref "RFAM:RF00013"; gbkey "ncRNA"; gene "ssrS"; inference "COORDINATES: profile:INFERNAL:1.1.1"; locus_tag "HKH70_06865"; product "6S RNA"; Name "ssrS"; gene_biotype "ncRNA";
CP052994.1 cmsearch transcript 1706296 1706410 . - . transcript_id "rna-HKH70_08190"; gene_id "gene-HKH70_08190"; gene_name "rrf"; Dbxref "RFAM:RF00001"; gbkey "rRNA"; gene "rrf"; inference "COORDINATES: profile:INFERNAL:1.1.1"; locus_tag "HKH70_08190"; product "5S ribosomal RNA"; Name "rrf"; gene_biotype "rRNA";
...
grep -v "_exon" HD21-2.gtf > HD21-2_non_exon.gtf #2656
cat HD21-2_non_exon.gtf HD21-2_Genbank_exon.gtf HD21-2_tRNAscan-SE_exon.gtf HD21-2_cmsearch_exon.gtf > HD21-2_new.gtf #5313
#in kate "exon" --> "\texon"
#[Eventually] sort the HD21-2_new.gtf using libreoffice
#[Eventually] replace "" with "; "transcript_id with transcript_id; "\n with \n in kate
#-- STEP 0.3: PREPARE bed from gtf
#[Eventually] replace '\t0\t' with '\t.\t' in the gff-file in the kate-editor
gffread -E -F --bed HD21-2_new.gtf -o HD21-2_new.bed # .. loaded 2649 genomic features from HD21-2_new.gtf
#[IMPORTANT] delelte [*.1] in gtf, bed. In fa could be e.g. ">CP052994 Staphylococcus epidermidis strain HD21-2 chromosome, complete genome"
#GTF: CP052994.1 Genbank exon 1 1356 . + . transcript_id "gene-HKH70_00005"; gene_id "gene-HKH70_00005"; gene_name "dnaA"; Dbxref "NCBI_GP:QRJ41002.1"; Name "QRJ41002.1"; gbkey "CDS"; gene "dnaA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_017723675.1"; locus_tag "HKH70_00005"; product "chromosomal replication initiator protein DnaA"; protein_id "QRJ41002.1"; transl_table "11"; gene_biotype "protein_coding";
#BED: CP052994.1 0 1356 gene-HKH70_00005 100 + 0 1356 0,0,0 1 1356, 0, CDS=0:1356;CDSphase=0;geneID=gene-HKH70_00005;gene_name=dnaA;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=HKH70_00005
-------
-- 2 --
conda activate rnaseq
# --> !!!! SUCCESSFUL !!!!
nextflow run rnaseq --reads '/home/jhuang/DATA/Data_Anna_HAPDICS_RNASeq/raw_data/*_R{1,2}.fq.gz' --fasta /home/jhuang/DATA/Data_Anna_HAPDICS_RNASeq/HD21-2.fa --gtf /home/jhuang/DATA/Data_Anna_HAPDICS_RNASeq/HD21-2_new.gtf --bed12 /home/jhuang/DATA/Data_Anna_HAPDICS_RNASeq/HD21-2_new.bed -profile standard --aligner hisat2 --fcGroupFeatures gene_id --fcGroupFeaturesType gene_biotype --saveReference --skip_genebody_coverage --skip_dupradar --skip_preseq --skip_edger -resume
#default fcGroupFeatures="gene_biotype" (or gbkey)
--skip_qc Skip all QC steps apart from MultiQC
--skip_fastqc Skip FastQC
--skip_rseqc Skip RSeQC
--skip_genebody_coverage Skip calculating genebody coverage
--skip_preseq Skip Preseq
--skip_dupradar Skip dupRadar (and Picard MarkDups)
--skip_edger Skip edgeR MDS plot and heatmap
--skip_multiqc Skip MultiQC
点赞本文的读者
还没有人对此文章表态
没有评论
Cross-Database Gene Annotation: Mapping Ensembl and UCSC Gene IDs
Updating Human Gene Identifiers using Ensembl BioMart: A Step-by-Step Guide
Variant Calling for Herpes Simplex Virus 1 from Patient Sample Using Capture Probe Sequencing
© 2023 XGenes.com Impressum