Pipeline

System requirements

  • Linux/Unix

Preprocess

Usage:

usage: bash m6a_pipeline.sh <samp> <ref(human, mouse)> <workpath> <refpath> <anno_dir>
<samp> : sample name
<ref>  : hg38/mm10
<workpath> : location of where data files are in
<refpath> : location of where ref_dir are in
<anno_dir> : location of where anno.gtf is in

m6a_pipeline.sh

#!/bin/bash
#You should index reference genome first

samp=$1
ref=$2
work_dir=$3
ref_dir=$4
anno_dir=$5
data_dir=$work_dir/01.Raw
trim_dir=$work_dir/02.Trim
bam_dir=$work_dir/03.Bam


# check input param
if [ $# -lt 4 ]; then
  echo "usage: bash m6a_pipeline.sh <samp> <ref(human, mouse)> <workpath> <refpath> <anno_dir>"
  echo "<samp> : sample name"
  echo "<ref>  : hg38/mm10"
  echo "<workpath> : location of where data files are in"
  echo "<refpath> : location of where ref_dir are in"
  echo "<anno_dir> : location of where anno.gtf is in"
  exit
fi


#check STAR index
if [ -f $ref_dir/$ref.fa ]||[ -f $anno_dir/$ref.utr.chr.gtf ];then
   if [ ! -f $ref_dir/$ref/*.tab ];then
      echo "You need build STAR index first!"
      STAR --runThreadN 8 --runMode genomeGenerate --genomeDir $ref_dir/$ref --genomeFastaFiles $ref_dir/$ref.fa --sjdbGTFfile $anno_dir/$ref.utr.chr.gtf --sjdbOverhang 149
      STAR --runThreadN 8 --runMode genomeGenerate --genomeDir $ref_dir/${ref}_control --genomeFastaFiles $ref_dir/$ref.fa $ref_dir/control.fa --sjdbGTFfile $anno_dir/$ref.gtf --sjdbOverhang 149
   else
      echo "STAR Index has already built"
   fi
fi



#Trim_galore
echo "Step.01--Trim Raw Data and Get Clean Data"

if [ ! -d $trim_dir ];then
   mkdir $trim_dir
fi

trim_galore -j 7 --phred33 --paired --stringency 3 -e 0.1 $data_dir/${samp}.R1.fastq.gz $data_dir/${samp}.R2.fastq.gz --gzip -o $trim_dir > $trim_dir/${samp}.log 2>&1

echo "Step.01 Done!"


#STAR mapping
echo "Step.02--Mapping Trimmed Reads"

if [ ! -d $bam_dir ];then
   mkdir $bam_dir
fi

if [ ! -d $bam_dir/CHRbam ];then
   mkdir $bam_dir/CHRbam
fi


STAR --runThreadN 30 --runMode alignReads --genomeDir $ref_dir/$ref --readFilesCommand zcat --readFilesIn $trim_dir/${samp}.R1_val_1.fq.gz $trim_dir/${samp}.R2_val_2.fq.gz --outFileNamePrefix $bam_dir/${samp}_ --outSAMtype BAM SortedByCoordinate  --outSAMstrandField intronMotif --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.04
STAR --runThreadN 30 --runMode alignReads --genomeDir $ref_dir/${ref}_control --readFilesCommand zcat --readFilesIn $trim_dir/${samp}.R1_val_1.fq.gz $trim_dir/${samp}.R2_val_2.fq.gz  --outFileNamePrefix $bam_dir/${samp}_cont --outSAMtype BAM SortedByCoordinate  --outSAMstrandField intronMotif --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.04

/home1/changzhanhe/basic_code/bam_add_chr.sh $bam_dir/${samp}_Aligned.sortedByCoord.out.bam $bam_dir/${samp}.CHR.bam
mv $bam_dir/${samp}.CHR.bam $bam_dir/CHRbam/


echo "Step.02 Done!"


##Flagstat
echo "Step.03--Flagstat Bam Files"

samtools flagstat $bam_dir/${samp}_contAligned.sortedByCoord.out.bam > $bam_dir/${samp}.map.flag.txt

echo "Step.03 Done!"


##IP effiecncy
echo "Step.04--Mapping Efficiency"

if [ ! -f $bam_dir/gluc_cluc_reads_count.txt ];then
   awk 'BEGIN{OFS="\t";print "sample","gluc","cluc","gluc/cluc"}' > $bam_dir/gluc_cluc_reads_count.txt
fi

gluc=`bamtools count -in $bam_dir/${samp}_contAligned.sortedByCoord.out.bam -region Modified_Control_RNA`
cluc=`bamtools count -in $bam_dir/${samp}_contAligned.sortedByCoord.out.bam -region Unmodified_Control_RNA`
awk 'BEGIN{OFS="\t";print "'$samp'","'$gluc'","'$cluc'",'$gluc'/'$cluc'}' >> $bam_dir/gluc_cluc_reads_count.txt


echo "Step.04 Done!"

rm $bam_dir/${samp}_contAligned.sortedByCoord.out.bam


##Decay
echo "Step.05--Decay condition"
java -jar /home1/xuxc/02.bin/picard.jar CollectRnaSeqMetrics I=$bam_dir/${samp}_Aligned.sortedByCoord.out.bam \
   O=$bam_dir/${samp}.RNA_Metrics \
   REF_FLAT=$anno_dir/refFlat.txt \
   CHART_OUTPUT=$bam_dir/${samp}.pdf \
   STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND

echo "Step.05 Done!"