Comprehensive smallRNA-7 profiling using exceRpt pipeline with full reference databases (v3)

gene_x 0 like s 10 view s

Tags: pipeline

  1. Input data

    # name                         condition
    # ----------------------------------------------
    # 0403_WaGa_wt                 parental_cells_1.fastq.gz
    # #0505_WaGa_wt_EV_RNA         untreated_1.fastq.gz
    # #0505_WaGa_sT_DMSO_EV_RNA    DMSO_control_1.fastq.gz
    # #0505_WaGa_sT_Dox_EV_RNA     sT_knockdown_1.fastq.gz
    # #0505_WaGa_scr_DMSO_EV_RNA   scr_DMSO_control_1.fastq.gz
    # #0505_WaGa_scr_Dox_EV_RNA    scr_control_1.fastq.gz
    # #1905_WaGa_wt_EV_RNA         untreated_2.fastq.gz
    # #1905_WaGa_sT_DMSO_EV_RNA    DMSO_control_2.fastq.gz
    # #1905_WaGa_sT_Dox_EV_RNA     sT_knockdown_2.fastq.gz
    # #1905_WaGa_scr_DMSO_EV_RNA   scr_DMSO_control_2.fastq.gz
    # #1905_WaGa_scr_Dox_EV_RNA    scr_control_2.fastq.gz
    #
    # WaGa_wt_cells_1              parental_cells_2.fastq.gz
    # WaGa_wt_cells_2              parental_cells_3.fastq.gz
    # #2001_WaGa_sT_DMSO           DMSO_control_3.fastq.gz
    # #2001_WaGa_sT_Dox            sT_knockdown_3.fastq.gz
    # #2001_WaGa_scr_DMSO          scr_DMSO_control_3.fastq.gz
    # #2001_WaGa_scr_Dox           scr_control_3.fastq.gz
    #
    # WaGa_wt_cells_1              parental_cells_2_R2.fastq.gz
    # WaGa_wt_cells_2              parental_cells_3_R2.fastq.gz
    # #2001_WaGa_sT_DMSO           DMSO_control_3_R2.fastq.gz
    # #2001_WaGa_sT_Dox            sT_knockdown_3_R2.fastq.gz
    # #2001_WaGa_scr_DMSO          scr_DMSO_control_3_R2.fastq.gz
    # #2001_WaGa_scr_Dox           scr_control_3_R2.fastq.gz
    
    mkdir ~/DATA/Data_Ute/Data_Ute_smallRNA_7/raw_data
    cd raw_data
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_3/220617_NB501882_0371_AH7572BGXM/nf774/0403_WaGa_wt_S20_R1_001.fastq.gz parental_cells_1.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf930/01_0505_WaGa_wt_EV_RNA_S1_R1_001.fastq.gz untreated_1.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf931/02_0505_WaGa_sT_DMSO_EV_RNA_S2_R1_001.fastq.gz DMSO_control_1.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf932/03_0505_WaGa_sT_Dox_EV_RNA_S3_R1_001.fastq.gz sT_knockdown_1.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf933/04_0505_WaGa_scr_DMSO_EV_RNA_S4_R1_001.fastq.gz scr_DMSO_control_1.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf934/05_0505_WaGa_scr_Dox_EV_RNA_S5_R1_001.fastq.gz scr_control_1.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf935/06_1905_WaGa_wt_EV_RNA_S6_R1_001.fastq.gz untreated_2.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf936/07_1905_WaGa_sT_DMSO_EV_RNA_S7_R1_001.fastq.gz DMSO_control_2.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf937/08_1905_WaGa_sT_Dox_EV_RNA_S8_R1_001.fastq.gz sT_knockdown_2.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf938/09_1905_WaGa_scr_DMSO_EV_RNA_S9_R1_001.fastq.gz scr_DMSO_control_2.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf939/10_1905_WaGa_scr_Dox_EV_RNA_S10_R1_001.fastq.gz scr_control_2.fastq.gz
    
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf961/WaGaWTcells_1_S1_R1_001.fastq.gz parental_cells_2.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf962/WaGaWTcells_2_S2_R1_001.fastq.gz parental_cells_3.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf971/2001_WaGa_sT_DMSO_S3_R1_001.fastq.gz DMSO_control_3.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf972/2001_WaGa_sT_Dox_S4_R1_001.fastq.gz sT_knockdown_3.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf973/2001_WaGa_scr_DMSO_S5_R1_001.fastq.gz scr_DMSO_control_3.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf974/2001_WaGa_scr_Dox_S6_R1_001.fastq.gz scr_control_3.fastq.gz
    
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf961/WaGaWTcells_1_S1_R2_001.fastq.gz parental_cells_2_R2.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf962/WaGaWTcells_2_S2_R2_001.fastq.gz parental_cells_3_R2.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf971/2001_WaGa_sT_DMSO_S3_R2_001.fastq.gz DMSO_control_3_R2.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf972/2001_WaGa_sT_Dox_S4_R2_001.fastq.gz sT_knockdown_3_R2.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf973/2001_WaGa_scr_DMSO_S5_R2_001.fastq.gz scr_DMSO_control_3_R2.fastq.gz
    ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf974/2001_WaGa_scr_Dox_S6_R2_001.fastq.gz scr_control_3_R2.fastq.gz
    
    #awk '{print $2}' temp3
    
  2. Adapter trimming

    #some common adapter sequences from different kits for reference:
    #    - TruSeq Small RNA (Illumina): TGGAATTCTCGGGTGCCAAGG
    #    - Small RNA Kits V1 (Illumina): TCGTATGCCGTCTTCTGCTTGT
    #    - Small RNA Kits V1.5 (Illumina): ATCTCGTATGCCGTCTTCTGCTTG
    #    - NEXTflex Small RNA Sequencing Kit v3 for Illumina Platforms (Bioo Scientific): TGGAATTCTCGGGTGCCAAGG
    #    - LEXOGEN Small RNA-Seq Library Prep Kit (Illumina): TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC *
    mkdir trimmed; cd trimmed
    for sample in parental_cells_1 untreated_1 DMSO_control_1 sT_knockdown_1 scr_DMSO_control_1 scr_control_1 untreated_2 DMSO_control_2 sT_knockdown_2 scr_DMSO_control_2 scr_control_2 parental_cells_2 parental_cells_3 DMSO_control_3 sT_knockdown_3 scr_DMSO_control_3 scr_control_3 parental_cells_2_R2 parental_cells_3_R2 DMSO_control_3_R2 sT_knockdown_3_R2 scr_DMSO_control_3_R2 scr_control_3_R2; do
      echo "------------------------------------ cutadapting the ${sample} -----------------------------------" >> LOG
      cutadapt -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -q 20 --minimum-length 5 --trim-n -o ${sample}.fastq.gz ../raw_data/${sample}.fastq.gz >> LOG
    done
    
    # In LOG file to look the differences of the R1 and R2 reads based on the statistics of trimming.
    
    #Reads with adapters:                10,114,799 (79.9%)
    #Reads with adapters:                   240,366 (1.9%)
    #Reads with adapters:                   233,380 (1.6%)
    #Reads with adapters:                   230,664 (1.3%)
    #Reads with adapters:                   207,717 (1.3%)
    #Reads with adapters:                   186,080 (1.2%)
    #Reads with adapters:                   577,429 (1.5%)
    #Reads with adapters:                   268,867 (1.7%)
    #Reads with adapters:                   325,300 (1.4%)
    #Reads with adapters:                   314,540 (1.5%)
    #Reads with adapters:                   264,349 (1.5%)
    
    #Reads with adapters:                   299,677 (0.7%)
    #Reads with adapters:                   108,801 (0.6%)
    #Reads with adapters:                     5,095 (0.0%)
    #Reads with adapters:                     6,989 (0.0%)
    #Reads with adapters:                     3,868 (0.0%)
    #Reads with adapters:                     2,173 (0.0%)
    
    #Reads with adapters:                   615,334 (1.4%)
    #Reads with adapters:                   258,388 (1.5%)
    #Reads with adapters:                   294,325 (1.4%)
    #Reads with adapters:                   336,932 (1.8%)
    #Reads with adapters:                   239,288 (2.0%)
    #Reads with adapters:                   117,544 (1.5%)
    
    #Alternatively, we can also cut adapter in the exceRpt built-in functions since 'grep "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC" /mnt/nvme0n1p1/MyexceRptDatabase/adapters/adapters.fa | wc -l' results in 48 records. However, explicitly cut adapter before is more ensured.
    
    #TODO: check if the R1 and R2 has the similar data distribution? Then decide if only R1 or both used for the downstream analysis?
    cat parental_cells_2.fastq.gz parental_cells_2_R2.fastq.gz > parental_cells_2_merged.fastq.gz
    cat parental_cells_3.fastq.gz parental_cells_3_R2.fastq.gz > parental_cells_3_merged.fastq.gz
    cat DMSO_control_3.fastq.gz DMSO_control_3_R2.fastq.gz > DMSO_control_3_merged.fastq.gz
    cat sT_knockdown_3.fastq.gz sT_knockdown_3_R2.fastq.gz > sT_knockdown_3_merged.fastq.gz
    cat scr_DMSO_control_3.fastq.gz scr_DMSO_control_3_R2.fastq.gz > scr_DMSO_control_3_merged.fastq.gz
    cat scr_control_3.fastq.gz scr_control_3_R2.fastq.gz > scr_control_3_merged.fastq.gz
    
    #Scenario   Option to use
    #-----------------------------
    #Trimming Read 1 only   -a
    #Trimming Read 2 only   -a
    #Trimming paired-end together   -a and -A
    #cutadapt -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -q 20 --minimum-length 5 --trim-n -o ${sample}_R2_trimmed.fastq.gz ../raw_data/${sample}_R2.fastq.gz
    cutadapt \
    -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC \
    -A TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC \
    -q 20 --minimum-length 5 --trim-n \
    -o ${sample}_R1_trimmed.fastq.gz -p ${sample}_R2_trimmed.fastq.gz \
    ../raw_data/${sample}_R1.fastq.gz ../raw_data/${sample}_R2.fastq.gz
    
    # -- check if it is necessary to remove adapter from 5'-end --
    #(Option_1) cutadapt -g TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -o /dev/null --report=minimal 0505_WaGa_wt_cutadapted.fastq.gz --> The trimming statistics in the output will show how often 5'-end adapters were removed.
    #(Option 2) zcat your_sample.fastq.gz | grep 'TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC' | head -n 20
    #(Option 3) fastqc your_sample.fastq.gz
    #Open the generated HTML report and check:
    #    The "Overrepresented sequences" section for adapter sequences.
    #    The "Per base sequence content" plot to see if there are unexpected sequences at the start of reads.
    #(If check results shows both ends contain adapter) cutadapt -g TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -q 20 --minimum-length 10 -o ${sample}_trimmed.fastq.gz ${sample}.fastq.gz >> LOG2
    #    -g → Trims 5'-end adapters
    #    -a → Trims 3'-end adapters; -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC → Specifies the adapter sequence to be removed from the 3' end of the reads. The sequence provided is common in RNA-seq libraries (e.g., Illumina small RNA sequencing).
    #    -q 20 → Performs quality trimming at both read ends, removing bases with a Phred quality score below 20.
    
  3. Install exceRpt (https://github.gersteinlab.org/exceRpt/)

    docker pull rkitchen/excerpt
    mkdir MyexceRptDatabase
    cd /mnt/nvme0n1p1/MyexceRptDatabase
    wget http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_hg38_lowmem.tgz
    tar -xvf exceRptDB_v4_hg38_lowmem.tgz
    #http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_hg19_lowmem.tgz
    #http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_hg38_lowmem.tgz
    #http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_mm10_lowmem.tgz
    wget http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_EXOmiRNArRNA.tgz
    tar -xvf exceRptDB_v4_EXOmiRNArRNA.tgz
    wget http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_EXOGenomes.tgz
    tar -xvf exceRptDB_v4_EXOGenomes.tgz
    
  4. Run exceRpt

    #[---- REAL_RUNNING_COMPLETE_DB ---->]
    #NOTE that if not renamed in the input files, then have to RENAME all files recursively by removing "_cutadapted.fastq" in all names in _CORE_RESULTS_v4.6.3.tgz (first unzip, removing, then zip, mv to ../results_g).
    cd trimmed
    #for file in *_cutadapted.fastq.gz; do
    #    echo "mv \"$file\" \"${file/_cutadapted.fastq/}\""
    #done
    for file in *.fastq.gz; do
        echo "mv \"$file\" \"${file/.fastq/}\""
    done
    
    mkdir results_exo6
    for sample in parental_cells_2 parental_cells_3 DMSO_control_3 sT_knockdown_3 scr_DMSO_control_3 scr_control_3    parental_cells_2_R2 parental_cells_3_R2 DMSO_control_3_R2 sT_knockdown_3_R2 scr_DMSO_control_3_R2 scr_control_3_R2    parental_cells_2_merged parental_cells_3_merged DMSO_control_3_merged sT_knockdown_3_merged scr_DMSO_control_3_merged scr_control_3_merged    parental_cells_1 untreated_1 DMSO_control_1 sT_knockdown_1 scr_DMSO_control_1 scr_control_1 untreated_2 DMSO_control_2 sT_knockdown_2 scr_DMSO_control_2 scr_control_2; do
        docker run -v ~/DATA/Data_Ute/Data_Ute_smallRNA_7/trimmed:/exceRptInput \
                   -v ~/DATA/Data_Ute/Data_Ute_smallRNA_7/results_exo6:/exceRptOutput \
                  -v /mnt/nvme0n1p1/MyexceRptDatabase:/exceRpt_DB \
                  -t rkitchen/excerpt \
                  INPUT_FILE_PATH=/exceRptInput/${sample}.gz MAIN_ORGANISM_GENOME_ID=hg38 N_THREADS=50 JAVA_RAM='200G' MAP_EXOGENOUS=on
    done
    
    #DEBUG the excerpt env
    docker inspect rkitchen/excerpt:latest
    # Without /bin/bash → May run and exit immediately
    #docker run -it rkitchen/excerpt
    # With /bin/bash → Stays open for interaction
    docker run -it --entrypoint /bin/bash rkitchen/excerpt
    
    #TODO: In the read2 exists the following adapter2, to test if the adapter can be identified and removed with the pipeline!
    
  5. Processing exceRpt output from multiple samples

    mkdir summaries_exo6
    cd ~/DATA/Data_Ute/Data_Ute_smallRNA_7/exceRpt-master
    (r_env) jhuang@WS-2290C:~/DATA/Data_Ute/Data_Ute_smallRNA_7/exceRpt-master$ R
    #WARNING: need to reload the R-script after each change of the script.
    source("mergePipelineRuns_functions.R")
    
    getwd()
    #[1] "/media/jhuang/Elements/Data_Ute/Data_Ute_smallRNA_7/exceRpt-master"
    processSamplesInDir("../results_exo6/", "../summaries_exo6")
    
    #~/Tools/csv2xls-0.4/csv_to_xls.py exceRpt_miRNA_ReadsPerMillion.txt exceRpt_tRNA_ReadsPerMillion.txt exceRpt_piRNA_ReadsPerMillion.txt -d$'\t' -o exceRpt_results_detailed.xls
    
  6. mv results_exo6 results_exo7; mkdir results_exo6; sudo mv _R2 ../results_exo6; sudo mv _merged ../results_exo6

    mkdir summaries_exo7
    processSamplesInDir("../results_exo7/", "../summaries_exo7")
    
  7. Re-draw the heatmap plots

    # -- R-code --
    
        # Load required library
        library(dplyr)
    
        # Original vectors
        samples_orig <- c("untreated_2", "parental_cells_1", "parental_cells_2", "parental_cells_3", "scr_control_3",
                        "DMSO_control_3", "scr_DMSO_control_3", "sT_knockdown_3", "untreated_1", "DMSO_control_1",
                        "scr_control_1", "scr_DMSO_control_1", "DMSO_control_2", "sT_knockdown_2", "scr_control_2",
                        "scr_DMSO_control_2", "sT_knockdown_1")
    
        categories_orig <- c("reads_used_for_alignment", "genome", "miRNA_sense", "miRNA_antisense",
                            "miRNAprecursor_sense", "miRNAprecursor_antisense", "tRNA_sense", "tRNA_antisense",
                            "piRNA_sense", "piRNA_antisense", "gencode_sense", "gencode_antisense",
                            "circularRNA_sense", "circularRNA_antisense", "not_mapped_to_genome_or_libs",
                            "repetitiveElements", "endogenous_gapped", "exogenous_miRNA", "exogenous_rRNA",
                            "exogenous_genomes")
    
        # Provided samples and categories (desired order and format)
        samples <- c("parental_cells_1","parental_cells_2","parental_cells_3",
                    "untreated_1","untreated_2",
                    "scr_control_1","scr_control_2","scr_control_3",
                    "DMSO_control_1","DMSO_control_2","DMSO_control_3",
                    "scr_DMSO_control_1","scr_DMSO_control_2","scr_DMSO_control_3",
                    "sT_knockdown_1","sT_knockdown_2","sT_knockdown_3")
    
        categories <- c("reads_used_for_alignment", "genome", "miRNA", "miRNAprecursor", "tRNA", "piRNA",
                        "gencode", "circularRNA", "not_mapped_to_genome_or_libs", "repetitiveElements",
                        "endogenous_gapped", "exogenous_miRNA", "exogenous_rRNA", "exogenous_genomes")
    
        # Original data matrix
        data_orig <- matrix(c(
                        100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0,
                        21.3, 97.4, 99.0, 99.0, 89.2, 91.9, 90.6, 91.0, 44.9, 65.6, 69.2, 73.3, 71.9, 81.4, 78.3, 79.3, 78.5,
                        3.5, 3.7, 88.7, 86.6, 70.9, 81.1, 77.9, 79.3, 7.1, 12.9, 7.0, 7.5, 14.6, 16.2, 14.7, 15.3, 15.8,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                        0.0, 0.2, 0.1, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.0,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                        8.4, 0.5, 2.9, 3.0, 1.7, 1.3, 1.2, 1.4, 25.3, 41.2, 49.0, 52.1, 33.9, 45.3, 41.4, 47.3, 48.8,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                        0.1, 0.0, 0.4, 0.5, 0.9, 1.6, 1.1, 1.4, 0.4, 0.4, 0.5, 0.4, 0.6, 0.3, 0.4, 0.4, 0.5,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                        6.7, 86.0, 5.3, 6.9, 7.9, 4.6, 5.5, 4.9, 8.6, 8.5, 10.8, 11.2, 18.3, 15.7, 16.6, 12.9, 10.8,
                        0.7, 0.1, 0.2, 0.2, 0.5, 0.2, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.3, 0.2, 0.3, 0.2, 0.2,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                        78.7, 2.6, 1.0, 1.0, 10.8, 8.1, 9.4, 9.0, 55.1, 34.4, 30.8, 26.7, 28.1, 18.6, 21.7, 20.7, 21.5,
                        0.1, 0.0, 0.0, 0.0, 0.2, 0.1, 0.1, 0.2, 0.3, 0.3, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1,
                        0.3, 0.0, 0.1, 0.1, 0.7, 0.5, 0.6, 0.5, 1.3, 0.9, 0.8, 0.7, 0.6, 0.3, 0.3, 0.3, 0.5,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                        0.2, 0.0, 0.0, 0.0, 0.3, 0.2, 0.2, 0.2, 1.5, 0.8, 0.8, 0.8, 0.7, 0.3, 0.3, 0.2, 0.5,
                        3.5, 0.0, 0.0, 0.0, 2.7, 1.6, 3.2, 2.2, 17.7, 9.3, 9.4, 6.9, 5.6, 2.4, 3.4, 3.3, 4.4), nrow = 20, byrow = TRUE)
    
        rownames(data_orig) <- categories_orig
        colnames(data_orig) <- samples_orig
    
        # Collapse sense/antisense
        merge_rows <- function(prefix) {
            row1 <- paste0(prefix, "_sense")
            row2 <- paste0(prefix, "_antisense")
            if (row1 %in% rownames(data_orig) && row2 %in% rownames(data_orig)) {
                return(data_orig[row1, ] + data_orig[row2, ])
            } else if (row1 %in% rownames(data_orig)) {
                return(data_orig[row1, ])
            } else {
                return(rep(0, ncol(data_orig)))
            }
        }
    
        # Construct merged data
        data_merged <- rbind(
        reads_used_for_alignment = data_orig["reads_used_for_alignment", ],
        genome = data_orig["genome", ],
        miRNA = merge_rows("miRNA"),
        miRNAprecursor = merge_rows("miRNAprecursor"),
        tRNA = merge_rows("tRNA"),
        piRNA = merge_rows("piRNA"),
        gencode = merge_rows("gencode"),
        circularRNA = merge_rows("circularRNA"),
        not_mapped_to_genome_or_libs = data_orig["not_mapped_to_genome_or_libs", ],
        repetitiveElements = data_orig["repetitiveElements", ],
        endogenous_gapped = data_orig["endogenous_gapped", ],
        exogenous_miRNA = data_orig["exogenous_miRNA", ],
        exogenous_rRNA = data_orig["exogenous_rRNA", ],
        exogenous_genomes = data_orig["exogenous_genomes", ]
        )
    
        # Reorder columns to match desired sample order
        data_final <- data_merged[, samples[samples %in% colnames(data_merged)]]
    
        #genome --> human_genome, not_mapped_to_genome_or_libs --> not_mapped_to_human_genome
        rownames(data_final)[rownames(data_final) == "genome"] <- "human_genome"
        rownames(data_final)[rownames(data_final) == "not_mapped_to_genome_or_libs"] <- "not_mapped_to_human_genome"
    
        # Save to Excel
        write.xlsx(data_final, file = "distribution_heatmap.xlsx", rowNames = TRUE)
    
    # -- Python-code --
    
        python ~/Scripts/plot_distribution_heatmap.py distribution_heatmap.xlsx distribution_heatmap.png
    
                import pandas as pd
                import numpy as np
                import seaborn as sns
                import matplotlib.pyplot as plt
    
                ## Load data from Excel file
                #file_path = "distribution_heatmap.xlsx"
                #
                ## Read Excel file, assuming first column is index (row labels)
                #df = pd.read_excel(file_path, index_col=0)
    
                # Convert percentages to decimals
                data = data / 100.0
    
                # Create DataFrame
                df = pd.DataFrame(data, index=categories, columns=samples)
    
                # Plot heatmap
                plt.figure(figsize=(14, 6))
                sns.heatmap(df, annot=True, cmap="coolwarm", fmt=".3f", linewidths=0.5, cbar_kws={'label': 'Fraction Aligned Reads'})
    
                # Improve layout
                plt.title("Heatmap of Read Alignments by Category and Sample", fontsize=14)
                plt.xlabel("Sample", fontsize=12)
                plt.ylabel("Read Category", fontsize=12)
                plt.xticks(rotation=15, ha="right", fontsize=10)
                plt.yticks(rotation=0, fontsize=10)
                plt.tight_layout()
    
                # Save as PNG
                plt.savefig("distribution_heatmap.png", dpi=300, bbox_inches="tight")
    
                # Show plot
                plt.show()
    
  8. Key steps of log: This log details the execution of a small RNA sequencing data analysis pipeline using the exceRpt tool (version 4.6.3) in a Docker container. The pipeline processes a human small RNA-seq dataset (testData_human.fastq.gz) with the following key steps:

    • Initial Setup

      • Docker container launched with mounted volumes for input/output and reference databases.
      • Parameters: hg38 genome, 50 threads, 200GB Java memory, exogenous mapping enabled.
      • Docker container launched with input/output volume mounts
      • 50 threads allocated with 200GB Java memory
      • hg38 reference genome specified
    • Preprocessing

      • Adapter detection and trimming using known adapter sequences.
      • Quality filtering (Phred score ≥20, length ≥18nt).
      • Removal of homopolymer-rich reads and low-quality sequences.
      • Input FASTQ file decompressed (testData_human.fastq.gz)
      • Adapter sequences identified using adapters.fa
      • Quality encoding determined (Phred+33/64)
      • Adapter clipping performed (TCGTATGCCGTCTTCTGCTTG)
      • Quality filtering (Q20, p<80%)
      • Homopolymer repeats filtered (max 66% single nt)
    • Contaminant Filtering

      • Alignment against UniVec contaminants and ribosomal RNA (rRNA) databases.
      • 322 reads processed, with statistics tracked at each step.
    • Endogenous RNA Analysis

      • Alignment to human genome (hg38) and transcriptome.
      • Quantification of small RNA types:
        • miRNA (mature/precursor): Sense strands detected (antisense absent).
        • tRNA, piRNA, gencode transcripts: Only sense strands reported.
        • circRNA: Not detected in this dataset.
      • Coverage and complexity metrics calculated.
    • Exogenous RNA Analysis

      • Screened for microbial/viral RNAs:
        • miRNA databases (miRBase).
        • Ribosomal RNA databases.
        • Comprehensive genomic databases (bacteria, plants, metazoa, fungi, viruses).
      • Taxonomic classification of exogenous hits performed.
    • QC & Results

      • QC Result: PASS (based on transcriptome/genome ratio >0.5 and >100k transcriptome reads).
      • Key Metrics:
        • Input Reads: ~1.5 million (exact count not shown in log).
        • Genome Mapped: Majority of reads.
        • Transcriptome Complexity: Calculated ratio.
      • Core results compressed into testData_human.fastq_CORE_RESULTS_v4.6.3.tgz.
    • Notable Observations:

      • Antisense Reads: Absent for miRNA, tRNA, and piRNA (common in small RNA-seq).
      • Potential Issues: Some files (e.g., antisense counts) were missing but did not disrupt pipeline.
      • Resource Usage: High RAM (200GB) and multi-threading (50 cores) employed for efficiency.
    • Output Files:

      • Quantified counts for endogenous RNAs (miRNA, tRNA, etc.).
      • Exogenous RNA alignments with taxonomic annotations.
      • QC report, adapter sequences, and alignment statistics.
  9. Downstream analyis using R for miRNAs

    # see http://xgenes.com/article/article-content/288/draw-plots-for-mirnas-generated-by-compsra/
    # see http://xgenes.com/article/article-content/289/draw-plots-for-pirna-generated-by-compsra/
    # see http://xgenes.com/article/article-content/290/draw-plots-for-snrna-generated-by-compsra/
    
    #Input file
    #exceRpt_miRNA_ReadCounts.txt
    #exceRpt_piRNA_ReadCounts.txt
    
    cd ~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7
    mamba activate r_env
    R
    #> .libPaths()
    #[1] "/home/jhuang/mambaforge/envs/r_env/lib/R/library"
    
    #BiocManager::install("AnnotationDbi")
    #BiocManager::install("clusterProfiler")
    #BiocManager::install(c("ReactomePA","org.Hs.eg.db"))
    #BiocManager::install("limma")
    #BiocManager::install("sva")
    #install.packages("writexl")
    #install.packages("openxlsx")
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library("org.Hs.eg.db")
    library(DESeq2)
    library(gplots)
    library(limma)
    library(sva)
    #library(writexl)  #d.raw_with_rownames <- cbind(RowNames = rownames(d.raw), d.raw); write_xlsx(d.raw, path = "d_raw.xlsx");
    library(openxlsx)
    
    setwd("../summaries_exo7/")
    d.raw<- read.delim2("exceRpt_miRNA_ReadCounts.txt",sep="\t", header=TRUE, row.names=1)
    
    # Desired column order
    desired_order <- c(
        "parental_cells_1", "parental_cells_2", "parental_cells_3",
        "untreated_1", "untreated_2",
        "scr_control_1", "scr_control_2", "scr_control_3",
        "DMSO_control_1", "DMSO_control_2", "DMSO_control_3",
        "scr_DMSO_control_1", "scr_DMSO_control_2", "scr_DMSO_control_3",
        "sT_knockdown_1", "sT_knockdown_2", "sT_knockdown_3"
    )
    # Reorder columns
    d.raw <- d.raw[, desired_order]
    setdiff(desired_order, colnames(d.raw))  # Shows missing or misnamed columns
    #sapply(d.raw, is.numeric)
    d.raw[] <- lapply(d.raw, as.numeric)
    #d.raw[] <- lapply(d.raw, function(x) as.numeric(as.character(x)))
    d.raw <- round(d.raw)
    write.csv(d.raw, file ="d_raw.csv")
    write.xlsx(d.raw, file = "d_raw.xlsx", rowNames = TRUE)
    
    # ------ Code sent to Ute ------
    #d.raw <- read.delim2("d_raw.csv",sep=",", header=TRUE, row.names=1)
    parental_or_EV = as.factor(c("parental","parental","parental", "EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV"))
    #donor = as.factor(c("0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905"))
    batch = as.factor(c("Aug22","March25","March25", "Sep23","Sep23", "Sep23","Sep23","March25", "Sep23","Sep23","March25", "Sep23","Sep23","March25", "Sep23","Sep23","March25"))
    
    replicates = as.factor(c("parental_cells","parental_cells","parental_cells",  "untreated","untreated",   "scr_control","scr_control","scr_control",  "DMSO_control","DMSO_control","DMSO_control",  "scr_DMSO_control", "scr_DMSO_control","scr_DMSO_control",  "sT_knockdown", "sT_knockdown", "sT_knockdown"))
    ids = as.factor(c("parental_cells_1", "parental_cells_2", "parental_cells_3",
        "untreated_1", "untreated_2",
        "scr_control_1", "scr_control_2", "scr_control_3",
        "DMSO_control_1", "DMSO_control_2", "DMSO_control_3",
        "scr_DMSO_control_1", "scr_DMSO_control_2", "scr_DMSO_control_3",
        "sT_knockdown_1", "sT_knockdown_2", "sT_knockdown_3"))
    cData = data.frame(row.names=colnames(d.raw), replicates=replicates, ids=ids, batch=batch, parental_or_EV=parental_or_EV)
    dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+batch)
    
    # Filter low-count miRNAs
    dds <- dds[ rowSums(counts(dds)) > 10, ]  #1322-->903
    rld <- rlogTransformation(dds)
    
    # -- before pca --
    png("pca.png", 1200, 800)
    plotPCA(rld, intgroup=c("replicates"))
    #plotPCA(rld, intgroup = c("replicates", "batch"))
    #plotPCA(rld, intgroup = c("replicates", "ids"))
    #plotPCA(rld, "batch")
    dev.off()
    png("pca2.png", 1200, 800)
    #plotPCA(rld, intgroup=c("replicates"))
    #plotPCA(rld, intgroup = c("replicates", "batch"))
    #plotPCA(rld, intgroup = c("replicates", "ids"))
    plotPCA(rld, "batch")
    dev.off()
    
    # Batch Effect Removal Methods:
    
    #Applying batch effect correction techniques such as ComBat or SVA (Surrogate Variable Analysis).
    
    #- Using ComBat (from the sva package):
    
        # Assume `rld` is the rlog-transformed counts from DESeq2
        rld_corrected <- ComBat(dat = assay(rld), batch = cData$batch, mod = model.matrix(~ replicates, data = cData))
    
        # Visualize corrected PCA
        pca_corrected <- prcomp(t(rld_corrected))
        png("pca_after_batch_correction.png", 1200, 800)
        plot(pca_corrected$x[, 1:2], col = cData$replicates)
        dev.off()
    
    #- Using SVA (Surrogate Variable Analysis):
    
        #If batch effects are strong and you want to remove hidden batch effects, SVA can help identify latent factors. After identifying these latent factors, you can add them to the DESeq2 design.
    
        # Assume that rld contains the rlog-transformed data
        mod <- model.matrix(~ replicates, data = cData)  # This should include your main experimental variables
        sva_results <- sva(assay(rld), mod)
    
        #You would then adjust the design formula to include these latent variables.
    
    #- Using removeBatchEffect (CHOSEN!)
    
        #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#how-do-i-use-vst-or-rlog-data-for-differential-testing
        mat <- assay(rld)
        mm <- model.matrix(~replicates, colData(rld))
        mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
        assay(rld) <- mat
    
    #- After batch effect removal, you should see a shift in the PCA plot — ideally, the samples should now cluster based on replicates or biological conditions rather than the batch.
    #If the batch effect has been successfully removed:
    #    * Before correction: You will likely see samples grouped by batch.
    #    * After correction: You should see the samples grouped by biological condition (e.g., parental, EV, scr_control, etc.).
    
        # -- after pca --
        png("pca_after_batch_correction.png", 1200, 800)
        #plotPCA(rld, intgroup = c("replicates", "batch"))
        #plotPCA(rld, intgroup = c("replicates", "ids"))
        plotPCA(rld, intgroup=c("replicates"))
        dev.off()
        png("pca_after_batch_correction2.png", 1200, 800)
        plotPCA(rld, "batch")
        dev.off()
    
        # -- after heatmap --
        ## generate the pairwise comparison between samples
        png("heatmap_after_batch_correction.png", 1200, 800)
        distsRL <- dist(t(assay(rld)))
        mat <- as.matrix(distsRL)
        rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,batch, sep=":"))
        #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,ids, sep=":"))
        hc <- hclust(distsRL)
        hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
        heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
        dev.off()
    
    #### STEP2: DEGs ####
    #- Heatmap untreated/wt vs parental; 1x for WaGa cell line
    #- Volcano plot untreated/wt vs parental; 1x for WaGa cell line
    #- Manhattan plot miRNAs; 1x for WaGa cell line
    #- Distribution of different small RNA species untreated/wt and parental; 1x for WaGa cell line
    #- Motif analysis: identify RNA-binding proteins that may regulate small RNA loading; 1x for WaGa cell line
    
    #convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
    sizeFactors(dds)
    #NULL
    dds <- estimateSizeFactors(dds)
    sizeFactors(dds)
    normalized_counts <- counts(dds, normalized=TRUE)
    write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
    write.xlsx(normalized_counts, file = "normalized_counts.xlsx", rowNames = TRUE)
    
    #---- untreated, scr_control, DMSO_control, scr_DMSO_control, sT_knockdown to parental_cells ----
    dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+batch)
    
    dds$replicates <- relevel(dds$replicates, "parental_cells")
    dds = DESeq(dds, betaPrior=FALSE)  #default betaPrior is FALSE
    resultsNames(dds)
    clist <- c("untreated_vs_parental_cells")
    
    for (i in clist) {
        contrast = paste("replicates", i, sep="_")
        res = results(dds, name=contrast)
        res <- res[!is.na(res$log2FoldChange),]
        #https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
        res$padj <- ifelse(is.na(res$padj), 1, res$padj)
        res_df <- as.data.frame(res)
        write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
        up <- subset(res_df, padj<=0.1 & log2FoldChange>=2)
        down <- subset(res_df, padj<=0.1 & log2FoldChange<=-2)
        write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
        write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    untreated_vs_parental_cells-all.txt \
    untreated_vs_parental_cells-up.txt \
    untreated_vs_parental_cells-down.txt \
    -d$',' -o untreated_vs_parental_cells.xls;
    
    # ------------------- volcano_plot -------------------
    library(ggplot2)
    library(ggrepel)
    
    geness_res <- read.csv(file = paste("untreated_vs_parental_cells", "all.txt", sep="-"), row.names=1)
    
    external_gene_name <- rownames(geness_res)
    geness_res <- cbind(geness_res, external_gene_name)
    #top_g are from ids
    top_g <- c("hsa-let-7b-5p","hsa-let-7g-5p","hsa-let-7i-5p","hsa-miR-103a-3p","hsa-miR-107","hsa-miR-1224-5p","hsa-miR-122-5p","hsa-miR-1226-5p","hsa-miR-1246","hsa-miR-127-3p","hsa-miR-1290","hsa-miR-130a-3p","hsa-miR-139-3p","hsa-miR-141-3p","hsa-miR-143-3p","hsa-miR-148b-3p","hsa-miR-155-5p","hsa-miR-15a-5p","hsa-miR-17-5p","hsa-miR-184","hsa-miR-18a-3p","hsa-miR-18a-5p","hsa-miR-190a-5p","hsa-miR-191-5p","hsa-miR-193b-5p","hsa-miR-197-5p","hsa-miR-200a-3p","hsa-miR-200b-5p","hsa-miR-206","hsa-miR-20a-5p","hsa-miR-210-3p","hsa-miR-2110","hsa-miR-21-5p","hsa-miR-218-5p","hsa-miR-219a-1-3p","hsa-miR-221-3p","hsa-miR-23b-3p","hsa-miR-27a-3p","hsa-miR-27b-3p","hsa-miR-27b-5p","hsa-miR-28-3p","hsa-miR-30a-5p","hsa-miR-30c-5p","hsa-miR-30e-5p","hsa-miR-3127-5p","hsa-miR-3131","hsa-miR-3180|hsa-miR-3180-3p","hsa-miR-320a","hsa-miR-320b","hsa-miR-320c","hsa-miR-320d","hsa-miR-330-3p","hsa-miR-335-3p","hsa-miR-33b-5p","hsa-miR-340-5p","hsa-miR-342-5p","hsa-miR-3605-5p","hsa-miR-361-3p","hsa-miR-365a-5p","hsa-miR-374b-5p","hsa-miR-378i","hsa-miR-379-5p","hsa-miR-3940-5p","hsa-miR-409-3p","hsa-miR-411-5p","hsa-miR-423-3p","hsa-miR-423-5p","hsa-miR-4286","hsa-miR-429","hsa-miR-432-5p","hsa-miR-4326","hsa-miR-451a","hsa-miR-4520-3p","hsa-miR-454-3p","hsa-miR-4646-5p","hsa-miR-4667-5p","hsa-miR-4748","hsa-miR-483-5p","hsa-miR-486-5p","hsa-miR-5010-5p","hsa-miR-504-3p","hsa-miR-5187-5p","hsa-miR-590-3p","hsa-miR-6128","hsa-miR-625-5p","hsa-miR-6726-5p","hsa-miR-6730-5p","hsa-miR-676-3p","hsa-miR-6767-5p","hsa-miR-6777-5p","hsa-miR-6780a-5p","hsa-miR-6794-5p","hsa-miR-6817-3p","hsa-miR-708-5p","hsa-miR-7-5p","hsa-miR-766-5p","hsa-miR-7854-3p","hsa-miR-873-3p","hsa-miR-885-3p","hsa-miR-92b-5p","hsa-miR-93-5p","hsa-miR-937-3p","hsa-miR-9-5p","hsa-miR-98-5p")
    subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0))
    geness_res$Color <- "NS or log2FC < 2.0"
    geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
    geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
    geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
    
    write.csv(geness_res, "untreated_vs_parental_cells_with_Category.csv")
    geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
    
    geness_res <- geness_res[, -1*ncol(geness_res)]
    png("volcano_plot_untreated_vs_parental_cells.png",width=1200, height=1400)
    #svg("untreated_vs_parental_cells.svg",width=12, height=14)
    ggplot(geness_res,       aes(x = log2FoldChange, y = -log10(pvalue),           color = Color, label = external_gene_name)) +       geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +       geom_hline(yintercept = -log10(0.05), lty = "dashed") +       geom_point() +       labs(x = "log2(FC)", y = "Significance, -log10(P)", color = "Significance") +       scale_color_manual(values = c("P < 0.05"="orange","P-adj < 0.05"="red","NS or log2FC < 2.0"="darkgray"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) +       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) +       theme_bw(base_size = 16) +       theme(legend.position = "bottom")
    dev.off()
    
    # ------------------ differentially_expressed_miRNAs_heatmap -----------------
    # prepare all_genes
    rld <- rlogTransformation(dds)
    mat <- assay(rld)
    mm <- model.matrix(~replicates, colData(rld))
    mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
    assay(rld) <- mat
    RNASeq.NoCellLine <- assay(rld)
    
    # reorder the columns
    #colnames(RNASeq.NoCellLine) = c("0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox","0505 WaGa wt","1905 WaGa wt","control MKL1","control WaGa")
    #col.order <-c("control MKL1",  "control WaGa","0505 WaGa wt","1905 WaGa wt","0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox")
    #RNASeq.NoCellLine <- RNASeq.NoCellLine[,col.order]
    
    #Option4: manully defining
    #for i in untreated_vs_parental_cells; do
    #  echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id";
    #  echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id";
    #done
    #cat *.id | sort -u > ids
    ##add Gene_Id in the first line, delete the ""
    GOI <- read.csv("ids")$Gene_Id
    datamat = RNASeq.NoCellLine[GOI, ]
    
    # clustering the genes and draw heatmap
    #datamat <- datamat[,-1]  #delete the sample "control MKL1"
    datamat <- datamat[, 1:5]
    
    colnames(datamat)[1] <- "parental cells 1"
    colnames(datamat)[2] <- "parental cells 2"
    colnames(datamat)[3] <- "parental cells 3"
    colnames(datamat)[4] <- "untreated 1"
    colnames(datamat)[5] <- "untreated 2"
    
    write.csv(datamat, file ="gene_expression_keeping_replicates.txt")
    write.xlsx(datamat, file = "gene_expression_keeping_replicates.xlsx", rowNames = TRUE)
    #"ward.D"’, ‘"ward.D2"’,‘"single"’, ‘"complete"’, ‘"average"’ (= UPGMA), ‘"mcquitty"’(= WPGMA), ‘"median"’ (= WPGMC) or ‘"centroid"’ (= UPGMC)
    hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
    hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
    mycl = cutree(hr, h=max(hr$height)/1.1)
    mycol = c("YELLOW", "BLUE", "ORANGE", "CYAN", "GREEN", "MAGENTA", "GREY", "LIGHTCYAN", "RED",     "PINK", "DARKORANGE", "MAROON",  "LIGHTGREEN", "DARKBLUE",  "DARKRED",   "LIGHTBLUE", "DARKCYAN",  "DARKGREEN", "DARKMAGENTA");
    mycol = mycol[as.vector(mycl)]
    
    rownames(datamat) <- sub("\\|.*", "", rownames(datamat))
    
    png("DEGs_heatmap_keeping_replicates.png", width=800, height=1200)
    #svg("DEGs_heatmap_keeping_replicates.svg", width=6, height=8)
    heatmap.2(as.matrix(datamat),
        Rowv=as.dendrogram(hr),
        Colv=NA,
        dendrogram='row',
        labRow=row.names(datamat),
        scale='row',
        trace='none',
        col=bluered(75),
        RowSideColors=mycol,
        srtCol=20,
        lhei=c(1,8),
        cexRow=1.2,   # Increase row label font size
        cexCol=1.7,    # Increase column label font size
        margin=c(7, 10)
        )
    dev.off()
    
    # ----------- manhattan_plot -------------
    
    # TODO_TOMORROW: the top miRNA should different, since we want to see the differentially expressed miRNA, therefore we should show the top DEG miRNA, find the top-5 and mark the 5 as the red points and give the label!
    # TODO_piRNA
    # TODO: Both motiv calling!
    # TODO: send the results to Ute!
    
    # Load the required libraries
    library(ggplot2)
    library(dplyr)
    library(tidyr)
    library(ggrepel)  # For better label positioning
    
    # Step 1: Compute RPM from raw counts (d.raw has miRNAs in rows, samples in columns)
    d.raw_5 <- d.raw[, 1:5]  # assuming 5 samples
    total_counts <- colSums(d.raw_5)
    RPM <- sweep(d.raw_5, 2, total_counts, FUN = "/") * 1e6
    
    # Step 2: Prepare long-format dataframe
    RPM$miRNA <- rownames(RPM)
    df <- pivot_longer(RPM, cols = -miRNA, names_to = "sample", values_to = "RPM")
    
    # Step 3: Log-transform RPM
    df <- df %>%
    mutate(logRPM = log10(RPM + 1))
    
    # Step 4: Add miRNA index for x-axis positioning
    df <- df %>%
    arrange(miRNA) %>%
    group_by(sample) %>%
    mutate(Position = row_number())
    
    # Step 5: Identify top miRNAs based on mean RPM
    top_mirnas <- df %>%
    group_by(miRNA) %>%
    summarise(mean_RPM = mean(RPM)) %>%
    arrange(desc(mean_RPM)) %>%
    head(5) %>%
    pull(miRNA)  # Get the names of top 5 miRNAs
    
    # Step 6: Assign color based on whether the miRNA is top or not
    df$color <- ifelse(df$miRNA %in% top_mirnas, "red", "darkblue")
    
    # Rename the sample labels for display
    sample_labels <- c(
    "parental_cells_1" = "Parental cell 1",
    "parental_cells_2" = "Parental cell 2",
    "parental_cells_3" = "Parental cell 3",
    "untreated_1"      = "Untreated 1",
    "untreated_2"      = "Untreated 2"
    )
    
    # Step 7: Plot
    png("manhattan_plot_top_miRNAs_based_on_mean_RPM.png", width = 1200, height = 1200)
    ggplot(df, aes(x = Position, y = logRPM, color = color)) +
    scale_color_manual(values = c("red" = "red", "darkblue" = "darkblue")) +
    geom_jitter(width = 0.4) +
    geom_text_repel(
        data = df %>% filter(miRNA %in% top_mirnas),
        aes(label = miRNA),
        box.padding = 0.5,
        point.padding = 0.5,
        segment.color = 'gray50',
        size = 5,
        max.overlaps = 8,
        color = "black"
    ) +
    labs(x = "", y = "log10(Read Per Million) (RPM)") +
    facet_wrap(~sample, scales = "free_x", ncol = 5,
                labeller = labeller(sample = sample_labels)) +
    theme_minimal() +
    theme(
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none",
        text = element_text(size = 16),
        axis.title = element_text(size = 18),
        strip.text = element_text(size = 16, face = "bold"),
        panel.spacing = unit(1.5, "lines")  # <-- More space between plots
    )
    dev.off()
    
    top_mirnas = c("hsa-miR-20a-5p","hsa-miR-93-5p","hsa-let-7g-5p","hsa-miR-30a-5p","hsa-miR-423-5p","hsa-let-7i-5p")
    #,"hsa-miR-17-5p","hsa-miR-107","hsa-miR-483-5p","hsa-miR-9-5p","hsa-miR-103a-3p","hsa-miR-30e-5p","hsa-miR-21-5p","hsa-miR-30d-5p")
    
    # Step 6: Assign color based on whether the miRNA is top or not
    df$color <- ifelse(df$miRNA %in% top_mirnas, "red", "darkblue")
    
    # Rename the sample labels for display
    sample_labels <- c(
    "parental_cells_1" = "Parental cell 1",
    "parental_cells_2" = "Parental cell 2",
    "parental_cells_3" = "Parental cell 3",
    "untreated_1"      = "Untreated 1",
    "untreated_2"      = "Untreated 2"
    )
    
    # Step 7: Plot
    png("manhattan_plot_most_differentially_expressed_miRNAs.png", width = 1200, height = 1200)
    ggplot(df, aes(x = Position, y = logRPM, color = color)) +
    scale_color_manual(values = c("red" = "red", "darkblue" = "darkblue")) +
    geom_jitter(width = 0.4) +
    geom_text_repel(
        data = df %>% filter(miRNA %in% top_mirnas),
        aes(label = miRNA),
        box.padding = 0.5,
        point.padding = 0.5,
        segment.color = 'gray50',
        size = 5,
        max.overlaps = 8,
        color = "black"
    ) +
    labs(x = "", y = "log10(Read Per Million) (RPM)") +
    facet_wrap(~sample, scales = "free_x", ncol = 5,
                labeller = labeller(sample = sample_labels)) +
    theme_minimal() +
    theme(
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none",
        text = element_text(size = 16),
        axis.title = element_text(size = 18),
        strip.text = element_text(size = 16, face = "bold"),
        panel.spacing = unit(1.5, "lines")  # <-- More space between plots
    )
    dev.off()
    
    mkdir miRNAs
    mv *.png miRNAs
    mv *.svg miRNAs
    mv *.csv miRNAs
    mv *.xls* miRNAs
    mv *.id miRNAs
    mv ids miRNAs
    mv normalized_counts.txt miRNAs
    mv *-all.txt miRNAs
    mv *-up.txt miRNAs
    mv *-down.txt miRNAs
    mv gene_expression_keeping_replicates.txt miRNAs
    cd miRNAs
    mv DEGs_heatmap_keeping_replicates.png differentially_expressed_miRNAs_heatmap.png
    mv volcano_plot_untreated_vs_parental_cells.png volcano_plot_miRNAs_untreated_vs_parental_cells.png
    mv untreated_vs_parental_cells.xls miRNA_untreated_vs_parental_cells.xls
    
  10. Downstream analyis using R for piRNAs

    d.raw<- read.delim2("exceRpt_piRNA_ReadCounts.txt",sep="\t", header=TRUE, row.names=1)
    
    # Desired column order
    desired_order <- c(
        "parental_cells_1", "parental_cells_2", "parental_cells_3",
        "untreated_1", "untreated_2",
        "scr_control_1", "scr_control_2", "scr_control_3",
        "DMSO_control_1", "DMSO_control_2", "DMSO_control_3",
        "scr_DMSO_control_1", "scr_DMSO_control_2", "scr_DMSO_control_3",
        "sT_knockdown_1", "sT_knockdown_2", "sT_knockdown_3"
    )
    # Reorder columns
    d.raw <- d.raw[, desired_order]
    setdiff(desired_order, colnames(d.raw))  # Shows missing or misnamed columns
    #sapply(d.raw, is.numeric)
    d.raw[] <- lapply(d.raw, as.numeric)
    #d.raw[] <- lapply(d.raw, function(x) as.numeric(as.character(x)))
    d.raw <- round(d.raw)
    write.csv(d.raw, file ="d_raw.csv")
    write.xlsx(d.raw, file = "d_raw.xlsx", rowNames = TRUE)
    
    #Make the piRNA names shorter, e.g. "hsa_piR_016658|gb|DQ592931|Homo_sapiens:6:80508363:80508389:Plus" --> "hsa_piR_016658"
    #paste -d',' f1_1 f2_ > d_raw_.csv
    d.raw <- read.delim2("d_raw_.csv",sep=",", header=TRUE, row.names=1)
    parental_or_EV = as.factor(c("parental","parental","parental", "EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV"))
    #donor = as.factor(c("0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905"))
    batch = as.factor(c("Aug22","March25","March25", "Sep23","Sep23", "Sep23","Sep23","March25", "Sep23","Sep23","March25", "Sep23","Sep23","March25", "Sep23","Sep23","March25"))
    
    replicates = as.factor(c("parental_cells","parental_cells","parental_cells",  "untreated","untreated",   "scr_control","scr_control","scr_control",  "DMSO_control","DMSO_control","DMSO_control",  "scr_DMSO_control", "scr_DMSO_control","scr_DMSO_control",  "sT_knockdown", "sT_knockdown", "sT_knockdown"))
    ids = as.factor(c("parental_cells_1", "parental_cells_2", "parental_cells_3",
        "untreated_1", "untreated_2",
        "scr_control_1", "scr_control_2", "scr_control_3",
        "DMSO_control_1", "DMSO_control_2", "DMSO_control_3",
        "scr_DMSO_control_1", "scr_DMSO_control_2", "scr_DMSO_control_3",
        "sT_knockdown_1", "sT_knockdown_2", "sT_knockdown_3"))
    cData = data.frame(row.names=colnames(d.raw), replicates=replicates, ids=ids, batch=batch, parental_or_EV=parental_or_EV)
    dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+batch)
    
    # Filter low-count miRNAs
    dds <- dds[ rowSums(counts(dds)) > 10, ]  #364-->124
    rld <- rlogTransformation(dds)
    
    # -- before pca --
    png("pca.png", 1200, 800)
    plotPCA(rld, intgroup=c("replicates"))
    #plotPCA(rld, intgroup = c("replicates", "batch"))
    #plotPCA(rld, intgroup = c("replicates", "ids"))
    #plotPCA(rld, "batch")
    dev.off()
    png("pca2.png", 1200, 800)
    #plotPCA(rld, intgroup=c("replicates"))
    #plotPCA(rld, intgroup = c("replicates", "batch"))
    #plotPCA(rld, intgroup = c("replicates", "ids"))
    plotPCA(rld, "batch")
    dev.off()
    
    # Batch Effect Removal Methods:
    
    #Applying batch effect correction techniques such as ComBat, SVA (Surrogate Variable Analysis) or limma::removeBatchEffect.
    
        #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#how-do-i-use-vst-or-rlog-data-for-differential-testing
        mat <- assay(rld)
        mm <- model.matrix(~replicates, colData(rld))
        mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
        assay(rld) <- mat
    
    #- After batch effect removal, you should see a shift in the PCA plot — ideally, the samples should now cluster based on replicates or biological conditions rather than the batch.
    #If the batch effect has been successfully removed:
    #    * Before correction: You will likely see samples grouped by batch.
    #    * After correction: You should see the samples grouped by biological condition (e.g., parental, EV, scr_control, etc.).
    
        # -- after pca --
        png("pca_after_batch_correction.png", 1200, 800)
        #plotPCA(rld, intgroup = c("replicates", "batch"))
        #plotPCA(rld, intgroup = c("replicates", "ids"))
        plotPCA(rld, intgroup=c("replicates"))
        dev.off()
        png("pca_after_batch_correction2.png", 1200, 800)
        plotPCA(rld, "batch")
        dev.off()
    
        # -- after heatmap --
        ## generate the pairwise comparison between samples
        png("heatmap_after_batch_correction.png", 1200, 800)
        distsRL <- dist(t(assay(rld)))
        mat <- as.matrix(distsRL)
        rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,batch, sep=":"))
        #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,ids, sep=":"))
        hc <- hclust(distsRL)
        hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
        heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
        dev.off()
    
    #### STEP2: DEGs ####
    #- Heatmap untreated/wt vs parental; 1x for WaGa cell line
    #- Volcano plot untreated/wt vs parental; 1x for WaGa cell line
    #- Manhattan plot miRNAs; 1x for WaGa cell line
    #- Distribution of different small RNA species untreated/wt and parental; 1x for WaGa cell line
    #- Motif analysis: identify RNA-binding proteins that may regulate small RNA loading; 1x for WaGa cell line
    
    #convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
    sizeFactors(dds)
    #NULL
    dds <- estimateSizeFactors(dds)
    sizeFactors(dds)
    normalized_counts <- counts(dds, normalized=TRUE)
    write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
    write.xlsx(normalized_counts, file = "normalized_counts.xlsx", rowNames = TRUE)
    
    #---- untreated, scr_control, DMSO_control, scr_DMSO_control, sT_knockdown to parental_cells ----
    dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+batch)
    
    dds$replicates <- relevel(dds$replicates, "parental_cells")
    dds = DESeq(dds, betaPrior=FALSE)  #default betaPrior is FALSE
    resultsNames(dds)
    clist <- c("untreated_vs_parental_cells")
    
    for (i in clist) {
        contrast = paste("replicates", i, sep="_")
        res = results(dds, name=contrast)
        res <- res[!is.na(res$log2FoldChange),]
        #https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
        res$padj <- ifelse(is.na(res$padj), 1, res$padj)
        res_df <- as.data.frame(res)
        write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
        up <- subset(res_df, padj<=0.1 & log2FoldChange>=2)
        down <- subset(res_df, padj<=0.1 & log2FoldChange<=-2)
        write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
        write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    untreated_vs_parental_cells-all.txt \
    untreated_vs_parental_cells-up.txt \
    untreated_vs_parental_cells-down.txt \
    -d$',' -o untreated_vs_parental_cells.xls;
    
    # ------------------- volcano_plot -------------------
    library(ggplot2)
    library(ggrepel)
    
    geness_res <- read.csv(file = paste("untreated_vs_parental_cells", "all.txt", sep="-"), row.names=1)
    
    external_gene_name <- rownames(geness_res)
    geness_res <- cbind(geness_res, external_gene_name)
    #top_g are from ids
    top_g <- c("hsa_piR_000805","hsa_piR_001152","hsa_piR_001170","hsa_piR_001205","hsa_piR_009051","hsa_piR_010894","hsa_piR_012681","hsa_piR_012753","hsa_piR_016659","hsa_piR_017033","hsa_piR_017178","hsa_piR_018292","hsa_piR_018780","hsa_piR_019420","hsa_piR_020009","hsa_piR_020326","hsa_piR_020813","hsa_piR_020814","hsa_piR_020828")
    subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0))
    geness_res$Color <- "NS or log2FC < 2.0"
    geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
    geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
    geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
    
    write.csv(geness_res, "untreated_vs_parental_cells_with_Category.csv")
    geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
    
    geness_res <- geness_res[, -1*ncol(geness_res)]
    png("volcano_plot_piRNAs_untreated_vs_parental_cells.png",width=1200, height=1400)
    #svg("untreated_vs_parental_cells.svg",width=12, height=14)
    ggplot(geness_res,       aes(x = log2FoldChange, y = -log10(pvalue),           color = Color, label = external_gene_name)) +       geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +       geom_hline(yintercept = -log10(0.05), lty = "dashed") +       geom_point() +       labs(x = "log2(FC)", y = "Significance, -log10(P)", color = "Significance") +       scale_color_manual(values = c("P < 0.05"="orange","P-adj < 0.05"="red","NS or log2FC < 2.0"="darkgray"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) +       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) +       theme_bw(base_size = 16) +       theme(legend.position = "bottom")
    dev.off()
    
    # ------------------ differentially_expressed_piRNAs_heatmap -----------------
    # prepare all_genes
    rld <- rlogTransformation(dds)
    mat <- assay(rld)
    mm <- model.matrix(~replicates, colData(rld))
    mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
    assay(rld) <- mat
    RNASeq.NoCellLine <- assay(rld)
    
    #Option4: manully defining
    #for i in untreated_vs_parental_cells; do
    #  echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id";
    #  echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id";
    #done
    #cat *.id | sort -u > ids
    ##add Gene_Id in the first line, delete the ""
    GOI <- read.csv("ids")$Gene_Id
    datamat = RNASeq.NoCellLine[GOI, ]
    
    # clustering the genes and draw heatmap
    #datamat <- datamat[,-1]  #delete the sample "control MKL1"
    datamat <- datamat[, 1:5]
    
    colnames(datamat)[1] <- "parental cells 1"
    colnames(datamat)[2] <- "parental cells 2"
    colnames(datamat)[3] <- "parental cells 3"
    colnames(datamat)[4] <- "untreated 1"
    colnames(datamat)[5] <- "untreated 2"
    
    write.csv(datamat, file ="gene_expression_keeping_replicates.txt")
    write.xlsx(datamat, file = "gene_expression_keeping_replicates.xlsx", rowNames = TRUE)
    #"ward.D"’, ‘"ward.D2"’,‘"single"’, ‘"complete"’, ‘"average"’ (= UPGMA), ‘"mcquitty"’(= WPGMA), ‘"median"’ (= WPGMC) or ‘"centroid"’ (= UPGMC)
    hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
    hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
    mycl = cutree(hr, h=max(hr$height)/1.1)
    mycol = c("YELLOW", "BLUE", "ORANGE", "CYAN", "GREEN", "MAGENTA", "GREY", "LIGHTCYAN", "RED",     "PINK", "DARKORANGE", "MAROON",  "LIGHTGREEN", "DARKBLUE",  "DARKRED",   "LIGHTBLUE", "DARKCYAN",  "DARKGREEN", "DARKMAGENTA");
    mycol = mycol[as.vector(mycl)]
    
    rownames(datamat) <- sub("\\|.*", "", rownames(datamat))
    
    png("differentially_expressed_piRNAs_heatmap.png", width=800, height=800)
    #svg("differentially_expressed_piRNAs_heatmap.svg", width=6, height=8)
    heatmap.2(as.matrix(datamat),
        Rowv=as.dendrogram(hr),
        Colv=NA,
        dendrogram='row',
        labRow=row.names(datamat),
        scale='row',
        trace='none',
        col=bluered(75),
        RowSideColors=mycol,
        srtCol=20,
        lhei=c(1,4),
        cexRow=1.7,   # Increase row label font size
        cexCol=1.7,    # Increase column label font size
        margin=c(6, 12)
        )
    dev.off()
    
    # ----------- manhattan_plot -------------
    
    # Load the required libraries
    library(ggplot2)
    library(dplyr)
    library(tidyr)
    library(ggrepel)  # For better label positioning
    
    # Step 1: Compute RPM from raw counts (d.raw has piRNAs in rows, samples in columns)
    d.raw_5 <- d.raw[, 1:5]  # assuming 5 samples
    total_counts <- colSums(d.raw_5)
    RPM <- sweep(d.raw_5, 2, total_counts, FUN = "/") * 1e6
    
    # Step 2: Prepare long-format dataframe
    RPM$piRNA <- rownames(RPM)
    df <- pivot_longer(RPM, cols = -piRNA, names_to = "sample", values_to = "RPM")
    
    # Step 3: Log-transform RPM
    df <- df %>%
    mutate(logRPM = log10(RPM + 1))
    
    # Step 4: Add piRNA index for x-axis positioning
    df <- df %>%
    arrange(piRNA) %>%
    group_by(sample) %>%
    mutate(Position = row_number())
    
    # Step 5: Identify top piRNAs based on mean RPM
    top_pirnas <- df %>%
    group_by(piRNA) %>%
    summarise(mean_RPM = mean(RPM)) %>%
    arrange(desc(mean_RPM)) %>%
    head(5) %>%
    pull(piRNA)  # Get the names of top 5 piRNAs
    
    # Step 6: Assign color based on whether the piRNA is top or not
    df$color <- ifelse(df$piRNA %in% top_pirnas, "red", "darkblue")
    
    # Rename the sample labels for display
    sample_labels <- c(
    "parental_cells_1" = "Parental cell 1",
    "parental_cells_2" = "Parental cell 2",
    "parental_cells_3" = "Parental cell 3",
    "untreated_1"      = "Untreated 1",
    "untreated_2"      = "Untreated 2"
    )
    
    # Step 7: Plot
    png("manhattan_plot_top_piRNAs_based_on_mean_RPM.png", width = 1200, height = 1200)
    ggplot(df, aes(x = Position, y = logRPM, color = color)) +
    scale_color_manual(values = c("red" = "red", "darkblue" = "darkblue")) +
    geom_jitter(width = 0.4) +
    geom_text_repel(
        data = df %>% filter(piRNA %in% top_pirnas),
        aes(label = piRNA),
        box.padding = 0.5,
        point.padding = 0.5,
        segment.color = 'gray50',
        size = 5,
        max.overlaps = 8,
        color = "black"
    ) +
    labs(x = "", y = "log10(Read Per Million) (RPM)") +
    facet_wrap(~sample, scales = "free_x", ncol = 5,
                labeller = labeller(sample = sample_labels)) +
    theme_minimal() +
    theme(
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none",
        text = element_text(size = 16),
        axis.title = element_text(size = 18),
        strip.text = element_text(size = 16, face = "bold"),
        panel.spacing = unit(1.5, "lines")  # <-- More space between plots
    )
    dev.off()
    
    top_pirnas = c("hsa_piR_012681","hsa_piR_012753","hsa_piR_001152","hsa_piR_020813","hsa_piR_020828")
    
    # Step 6: Assign color based on whether the piRNA is top or not
    df$color <- ifelse(df$piRNA %in% top_pirnas, "red", "darkblue")
    
    # Rename the sample labels for display
    sample_labels <- c(
    "parental_cells_1" = "Parental cell 1",
    "parental_cells_2" = "Parental cell 2",
    "parental_cells_3" = "Parental cell 3",
    "untreated_1"      = "Untreated 1",
    "untreated_2"      = "Untreated 2"
    )
    
    # Step 7: Plot
    png("manhattan_plot_most_differentially_expressed_piRNAs.png", width = 1200, height = 1200)
    ggplot(df, aes(x = Position, y = logRPM, color = color)) +
    scale_color_manual(values = c("red" = "red", "darkblue" = "darkblue")) +
    geom_jitter(width = 0.4) +
    geom_text_repel(
        data = df %>% filter(piRNA %in% top_pirnas),
        aes(label = piRNA),
        box.padding = 0.5,
        point.padding = 0.5,
        segment.color = 'gray50',
        size = 5,
        max.overlaps = 8,
        color = "black"
    ) +
    labs(x = "", y = "log10(Read Per Million) (RPM)") +
    facet_wrap(~sample, scales = "free_x", ncol = 5,
                labeller = labeller(sample = sample_labels)) +
    theme_minimal() +
    theme(
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none",
        text = element_text(size = 16),
        axis.title = element_text(size = 18),
        strip.text = element_text(size = 16, face = "bold"),
        panel.spacing = unit(1.5, "lines")  # <-- More space between plots
    )
    dev.off()
    
    mkdir piRNAs
    mv *.png piRNAs
    mv *.csv piRNAs
    mv *.xls* piRNAs
    mv *.id piRNAs
    mv ids piRNAs
    mv normalized_counts.txt piRNAs
    mv *-all.txt piRNAs
    mv *-up.txt piRNAs
    mv *-down.txt piRNAs
    mv gene_expression_keeping_replicates.txt piRNAs
    cd piRNAs
    mv untreated_vs_parental_cells.xls piRNA_untreated_vs_parental_cells.xls
    
  11. Reporting

    Please find attached the analysis results for small RNAs in the WaGa cell line. miRNAs:

    * Heatmap comparing untreated/wt vs. parental (1x):
    See differentially_expressed_miRNAs_heatmap.png
    
    * Volcano plot comparing untreated/wt vs. parental (1x):
    See volcano_plot_miRNAs_untreated_vs_parental_cells.png
    
    * Manhattan plots highlighting top differentially expressed miRNAs (1x):
    See manhattan_plot_most_differentially_expressed_miRNAs.png and manhattan_plot_top_miRNAs_based_on_mean_RPM.png
    

    piRNAs:

    * Heatmap comparing untreated/wt vs. parental (1x):
    See differentially_expressed_piRNAs_heatmap.png
    
    * Volcano plot comparing untreated/wt vs. parental (1x):
    See volcano_plot_piRNAs_untreated_vs_parental_cells.png
    
    * Manhattan plots highlighting top differentially expressed piRNAs (1x):
    See manhattan_plot_most_differentially_expressed_piRNAs.png and manhattan_plot_top_piRNAs_based_on_mean_RPM.png
    

    Additional

    * Distribution of small RNA species (untreated/wt vs. parental, 1x):
    See distribution_heatmap.png
    
    * Differential expression tables:
    
        - miRNA_untreated_vs_parental_cells.xls
        - piRNA_untreated_vs_parental_cells.xls
    
        These files contain all differentially expressed miRNAs and piRNAs, respectively.
    

    If you’d like the R code used to generate the plots, along with the raw data and full tables, just let me know—I’ll be happy to send it over.

  12. TODO: Motif analysis: identify RNA-binding proteins that may regulate small RNA loading; 1x for WaGa cell line

    #-          RNA fragmentation patterns: is EV-RNA full length or fragmented?
    
    RNA-Seq Data
    
    -          PCA plot untreated/wt vs parental cells; 1x für WaGa cell line und 1x für MKL-1 cells
    
    -          Heatmap untreated/wt vs parental; 1x für WaGa cell line und 1x für MKL-1 cells
    
    -          Volcano plot untreated/wt vs parental; 1x für WaGa cell line und 1x für MKL-1 cells
    
    -          Distribution of different RNA Species untreated/wt and parental; 1x für WaGa cell line und 1x für MKL-1 cells
    
    -          RNA binding protein motifs: do we find specific motifs in EV-RNA?
    
            ✅ 5. RNA Binding Protein (RBP) Motifs: Do we find specific motifs in EV-RNA?
            🎯 Goal:
    
            Find RBP binding motifs enriched in extracellular vesicle (EV) RNA compared to parental/total cellular RNA.
            🔬 Workflow:
            Step 1: Get EV-RNA sequences
    
                Use transcript-level quantification (from RNA-Seq) to get upregulated transcripts in EV-RNA (vs parental).
    
                Extract their FASTA sequences (e.g., via Ensembl/biomaRt or gtf+fasta).
    
            # Use gffread or bedtools to extract sequences
            gffread -w ev_rna_transcripts.fa -g genome.fa ev_rna.gtf
    
            Step 2: Run motif discovery
    
            Use MEME, HOMER, or DREME to find enriched short motifs in EV-RNA transcripts.
    
            meme ev_rna_transcripts.fa -rna -oc meme_ev -nmotifs 10 -minw 6 -maxw 8
    
            Step 3: Match motifs to known RBP databases
    
            Use Tomtom (part of MEME Suite) to compare discovered motifs to known RBP motifs (e.g., from ATtRACT, CISBP-RNA, or RBPDB):
    
            tomtom meme_ev/meme.txt RBP_motif_database.meme -oc tomtom_out
    
            Step 4: Interpret results
    
                Identify RBPs whose motifs are significantly enriched in EV-RNA.
    
                Cross-check with known EV-associated RBPs (e.g., YBX1, hnRNPs, ALYREF, etc.).
    
    -          miRNA target analysis (Please see the small RNA analysis!)
    
            ✅ 6. miRNA Target Analysis
            🎯 Goal:
    
            Determine which genes are regulated by significant miRNAs (e.g., those enriched in EV-RNA or differentially expressed between untreated/wt vs parental).
            🔬 Workflow:
            Step 1: Get list of significant miRNAs
    
            From your DE analysis (small RNA-seq).
            Step 2: Retrieve target genes
    
            Use target prediction tools or databases:
    
                TargetScan (context++ scores)
    
                miRDB
    
                miRTarBase (experimentally validated)
    
                Or: use multiMiR package in R
    
            Example (R + multiMiR):
    
            library(multiMiR)
            results <- get_multimir(mirna = c("hsa-miR-21-5p", "hsa-miR-155-5p"),
                                    table = "validated", summary = TRUE)
            targets <- unique(results@data$target_symbol)
    
            Step 3: Check overlap with your RNA-seq DEGs
    
            See if predicted/validated targets are downregulated in RNA-seq.
    
            # Example: overlap with downregulated DEGs
            intersect(targets, downregulated_genes)
    
            Step 4: Pathway enrichment
    
            Use tools like:
    
                clusterProfiler (R)
    
                Enrichr
    
                DAVID or g:Profiler
    
            to analyze the biological functions of target genes.
            🧼 Summary Table
            Analysis Step   Tool    Input   Output
            RBP Motif Enrichment    MEME + Tomtom   EV-RNA sequences    Candidate RBPs
            miRNA Target Analysis   multiMiR / TargetScan   Significant miRNAs  Target genes & functions
    
    small RNA-Seq
    
    -          Heatmap untreated/wt vs parental; 1x für WaGa cell line
    
    -          Volcano plot untreated/wt vs parental; 1x für WaGa cell line
    
    -          Manhattan plot miRNAs; 1x für WaGa cell line
    
    -          Distribution of different small RNA Species untreated/wt and parental; 1x für WaGa cell line
    
    -          Motif analysis: identify RNA-binding proteins that may regulate small RNA loading; 1x für WaGa cell line
    
            🧪 Goal
    
            Identify RNA-binding proteins (RBPs) that might interact with or regulate small RNA (miRNA) loading into Argonaute complexes—possibly affecting miRNA stability or function.
            🔬 Suggested Workflow
            1. Extract miRNA Sequences
    
                Use miRBase to get mature miRNA sequences for your significant miRNAs.
    
                Output them in FASTA format.
    
            # Example: Get FASTA sequences
            # From miRBase or locally via Bioconductor in R
    
            2. Perform Motif Enrichment Analysis
    
            Use tools that can identify overrepresented sequence motifs in your miRNAs and then associate them with known RBP binding motifs.
            ✅ Tools for Motif Discovery:
    
                MEME Suite (MEME, DREME):
    
                    Input: miRNA sequences
    
                    Output: enriched motifs
    
                    Can be run via web interface or command line
    
                HOMER:
    
                    Particularly good for short motifs (like 6–8mers)
    
                    Input: FASTA + background set (non-significant miRNAs)
    
            Example MEME:
    
            meme miRNAs.fasta -rna -mod zoops -nmotifs 5 -minw 6 -maxw 8 -oc meme_out/
    
            3. Map Discovered Motifs to RBP Binding Motifs
    
            Once you have enriched motifs, the next step is to match them to known RBP motifs:
            ✅ Tools or Databases:
    
                ATtRACT or RBPDB – databases of known RBP motifs.
    
                Tomtom (part of MEME Suite) – compares discovered motifs to known motif databases.
    
            tomtom -oc tomtom_out/ meme_out/meme.txt RBP_motif_database.meme
    
            4. Integrate with Expression Data in WaGa Cells
    
                Check whether candidate RBPs are expressed in WaGa cells.
    
                    Use RNA-seq data (if available) or public datasets (e.g., DepMap, CCLE).
    
                    Filter out RBPs that are not expressed or very lowly expressed.
    
            5. Cross-reference with Literature
    
                For top RBPs:
    
                    Check if they are known to interact with miRNAs or Ago2-loading.
    
                    Prioritize RBPs involved in miRNA maturation, transport, or loading, e.g., HNRNPA1, DDX proteins, FMR1, etc.
    
            🔬 Suggested Workflow
            1. Extract miRNA Sequences
    
                Use miRBase to get mature miRNA sequences for your significant miRNAs.
    
                Output them in FASTA format.
    
                # Example: Get FASTA sequences
                # From miRBase or locally via Bioconductor in R
    
                wget ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz
                gunzip mature.fa.gz
                cp ids my_mirnas.txt
                seqkit grep -f my_mirnas.txt mature.fa > extracted_miRNAs.fa
    
            2. Perform Motif Enrichment Analysis
    
            Use tools that can identify overrepresented sequence motifs in your miRNAs and then associate them with known RBP binding motifs.
            ✅ Tools for Motif Discovery:
    
                MEME Suite (MEME, DREME):
    
                    Input: miRNA sequences
                    Output: enriched motifs
                    Can be run via web interface or command line
    
                HOMER:
    
                    Particularly good for short motifs (like 6–8mers)
                    Input: FASTA + background set (non-significant miRNAs)
    
            Example MEME:
    
                meme miRNAs.fasta -rna -mod zoops -nmotifs 5 -minw 6 -maxw 8 -oc meme_out/
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum