GAP 1.0.0 Manual

GAP(GTF Annotation Parser) is a tool writen by Python 2.7 to parse Gencode/Ensembl GTF file and NCBI GFF3 annotation file. By input a GTF/GFF3 file, it produce a *.genomeCoor.bed and a *.transCoor.bed files to easily read by human eyes. And users can get transcript information by use a *.genomeCoor.bed file and PaseTrans.py module.

1. Requirements

# Suppose $GAP_PATH is the path of GAP
export PYTHONPATH=$PYTHONPATH:$GAP_PATH

2. Parse Gencode/Ensembl GTF File

Gencode GTF: https://www.gencodegenes.org
Enesembl GTF: https://asia.ensembl.org/info/data/ftp/index.html

Example

# get genome fasta file
wget ftp://ftp.ensembl.org/pub/release-91/fasta/anolis_carolinensis/dna/Anolis_carolinensis.AnoCar2.0.dna_sm.toplevel.fa.gz
gzip -d Anolis_carolinensis.AnoCar2.0.dna_sm.toplevel.fa.gz

# get genome gtf annotation file
wget ftp://ftp.ensembl.org/pub/release-91/gtf/anolis_carolinensis/Anolis_carolinensis.AnoCar2.0.91.gtf.gz
gzip -d Anolis_carolinensis.AnoCar2.0.91.gtf.gz

# Parse
python $GAP_PATH/parseBedFromGTF.py \
    -g Anolis_carolinensis.AnoCar2.0.91.gtf \
    --genome Anolis_carolinensis.AnoCar2.0.dna_sm.toplevel.fa \
    -o Anolis_carolinensis \
    -s gencode

After these steps, it will produce 3 files:

Skip to part 5 to see what does these files mean.

3. Parse NCBI GFF3 File

NCBI GFF3: ftp://ftp.ncbi.nlm.nih.gov/genomes

Example

# get genome fasta file
rm *.fa
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Equus_caballus/Assembled_chromosomes/seq/*.fa.gz
gzip -d *fa.gz
cat * | awk -F "|" '{if(substr($0,1,1)==">"){print ">"$4}else{print $0}}' > Equus_caballus.fa

# get genome gtf annotation file
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Equus_caballus/GFF/ref_EquCab2.0_top_level.gff3.gz
gzip -d ref_EquCab2.0_top_level.gff3.gz

# Parse
python $GAP_PATH/parseBedFromGTF.py \
    -g ref_EquCab2.0_top_level.gff3 \
    --genome Equus_caballus.fa \
    -o Equus_caballus \
    -s ncbi \
    --genome_gff_same_chr yes

the raw fasta head cannot be recognized by pysam, so we should cut the ID from head

>gi|194246387|ref|NC_009149.2| Equus caballus isolate Twilight breed thoroughbred chromosome 6, EquCab2.0, whole genome shotgun sequence

4. Get Transcript Information from genomeCoor File

Examples

4.1 Get mRNA CDS sequence

from ParseTrans import *

genomeCoorBedFileName = "Anolis_carolinensis.genomeCoor.bed"
parseTrans = ParseTransClass(genomeCoorBedFile = genomeCoorBedFileName)
transFeature = parseTrans.getTransFeature(transID='ENSACAT00000008961')
print transFeature # transFeature is a dict

The content of transFeature(all coordination is 1-based):

# =-=-=-=-=-=-=-=-=-=-=-=-=-transID Info Show=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
                cds_end:        2514                     
              cds_start:        379                      
                    chr:        1                        
                    end:        11245407                 
               exon_str:        11245077-11245407,11241108-11241722,11238269-11238487,11236340-11236559,11234789-11234912,11233819-11234022,11232622-11232792,11231970-11232098,11228762-11228860,11227948-11228173,11226911-11227016,11225867-11225930,11220692-11225448
                gene_id:        ENSACAG00000008890       
              gene_name:        GOLGA5                   
              gene_type:        protein_coding           
                  start:        11220692                 
                 strand:        -                        
              trans_len:        7265                     
              utr_3_end:        7265                     
            utr_3_start:        2515                     
              utr_5_end:        378                      
            utr_5_start:        1                        
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

Get CDS sequence from transcriptome produce by $GP_PATH/parseBedFromGTF.py

from seq import cutSeq
transcriptomeFile = "Anolis_carolinensis_transcriptome.fa"
parseTrans.addSeq(seqFileName = transcriptomeFile)

OUT = open("CDS.fa", "w")
for trans_id in parseTrans.TransIParser:
    RNA = parseTrans.TransIParser[trans_id]
    cds_start = RNA['cds_start']
    cds_end = RNA['cds_end']
    if RNA['gene_type'] in ('mRNA', 'protein_coding'):
        print >>OUT, ">%s %d-%d\n%s" % ( trans_id, cds_start, cds_end, cutSeq(parseTrans.getTransFeatureSeq(transID=trans_id, feature='cds', source='Gencode')) )

OUT.close()

4.2 Get longest transcript for each gene

geneInfo = parseTrans.getGeneInfo()
longest_trans = {}
for gene_id in geneInfo:
    transcripts = geneInfo[gene_id]['transcript']
    longest_trans_id = transcripts[0]
    longest_trans_len = parseTrans.getTransFeature(transID=longest_trans_id)['trans_len']
    for trans_id in transcripts[1:]:
        cur_len = parseTrans.getTransFeature(transID=trans_id)['trans_len']
        if cur_len > longest_trans_len:
            longest_trans_len = cur_len
            longest_trans_id = trans_id
    longest_trans[gene_id] = longest_trans_id

5. File Format

5.1 *.genomeCoor.bed file

chr1    462439  468348  -       LOC106782066=106782066  XM_014733432.1  protein_coding  468082-468348,462439-466445     468082-468348,466385-466445,462439-465136
chr1    502861  504192  +       LOC100146619=100146619  XR_001378865.1  lncRNA  502861-502937,503598-504192
chr1    522033  526528  -       ZNF511=100066579        XM_014732975.1  protein_coding  526451-526528,525107-525358,524775-524899,523241-523369,522033-522450   522033-522371
chr1    531522  552416  +       TUBGCP2=100066555       XM_014733360.1  protein_coding  531522-531707,532971-533099,533997-534173,535520-535679,540702-540909,541239-541438,541868-542057,548953-549098,550688-550868,551554-552416     531522-531557,551774-552416

5.2 *.transCoor.bed file

XM_014733432.1  LOC106782066=106782066  protein_coding  4274    1-267,268-4274  1-267,268-328,1577-4274
XR_001378865.1  LOC100146619=100146619  lncRNA  672     1-77,78-672
XM_014732975.1  ZNF511=100066579        protein_coding  1002    1-78,79-330,331-455,456-584,585-1002    664-1002
XM_014733360.1  TUBGCP2=100066555       protein_coding  2440    1-186,187-315,316-492,493-652,653-860,861-1060,1061-1250,1251-1396,1397-1577,1578-2440  1-36,1798-2440

5.3 *_transcriptome.fa

>XM_014733432.1 106782066|LOC106782066|protein_coding
GGCTCCCTGGCTGCGCCCTGGGCCTCCTCTGAGGACTGGAGTGAGGAGCAGGCAGTGTGG
GTGGCCAGTACTGGCCACCGACAGCCTGTGATATTCAGGCCCTTGGTGCCCAGAACAGGG
CCTGAGTCACCGTGCGCAAGGACCCAGTGAACAAGCTCCACACGCCTTGTCAGCGGGAGA
TTCGGTGGAGAAGGAGCAGGCAGCTGCCTCGAGTAGGCCAGGACACAGCAGAAGGAGCGC
ACTCCCGTACCCGTTTGGAGCGTGCCGTCTGTCTGTCTGTCCTGATCTCTTGCAGCCTGA

The head part is constituted with transcriptID, GeneID, GeneName, GeneType

6. Compare CDS/UTR length of Gencode/RefSeq annotation


from ParseTrans import *

def statistic_UTR_dist(in_genomeCoor_file):
    parseTrans = ParseTransClass(genomeCoorBedFile = in_genomeCoor_file)
    utr5_len_dict = {}
    utr3_len_dict = {}
    cds_len_dict = {}
    for trans_id in parseTrans.TransIParser:
        RNA = parseTrans.TransIParser[trans_id]
        gene_name = RNA['gene_name'].upper()
        if gene_name == "NULL": continue
        if RNA['gene_type'] in ('mRNA', 'protein_coding'):
            cds_start = RNA['cds_start']
            cds_end = RNA['cds_end']
            length = RNA['trans_len']
            utr5_len = cds_start - 1
            utr3_len = length - cds_end
            cds_len = cds_end - cds_start + 1
            if utr5_len > 0:
                utr5_len_dict[gene_name] = utr5_len_dict.get(gene_name, []) + [utr5_len]
            if utr3_len > 0:
                utr3_len_dict[gene_name] = utr3_len_dict.get(gene_name, []) + [utr3_len]
            if cds_len > 0:
                cds_len_dict[gene_name] = cds_len_dict.get(gene_name, []) + [cds_len]
    return utr5_len_dict, utr3_len_dict, cds_len_dict


utr5_len_1, utr3_len_1, cds_len_1 = statistic_UTR_dist("hg38_gencode.genomeCoor.bed")
utr5_len_2, utr3_len_2, cds_len_2 = statistic_UTR_dist("hg38_ncbi.genomeCoor.bed")

utr5_len_1, utr3_len_1, cds_len_1 = statistic_UTR_dist("mm10_gencode.genomeCoor.bed")
utr5_len_2, utr3_len_2, cds_len_2 = statistic_UTR_dist("mm10_ncbi.genomeCoor.bed")



def plot_single_anno(UTR5_Set, UTR3_Set, CDS_Set):
    import seaborn as sns
    dataSet = []
    for gene_name in UTR5_Set:
        dataSet += [ [item, "UTR5"] for item in UTR5_Set[gene_name] ]
    for gene_name in UTR3_Set:
        dataSet += [ [item, "UTR3"] for item in UTR3_Set[gene_name] ]
    for gene_name in CDS_Set:
        dataSet += [ [item, "CDS"] for item in CDS_Set[gene_name] ]
    dataSet = pd.DataFrame(dataSet, columns=['length', 'type'])
    # show some statistics
    print "median of UTR5: ", dataSet.loc[dataSet.type=="UTR5", "length"].median()
    print "median of UTR3: ", dataSet.loc[dataSet.type=="UTR3", "length"].median()
    print "median of CDS: ", dataSet.loc[dataSet.type=="CDS", "length"].median()
    # plot
    sns.violinplot(data=dataSet, x='type', y='length')
    plt.xlabel("mRNA Part", fontsize="large")
    plt.ylabel("Length Distribution", fontsize="large")
    plt.ylim(-2000,5000)
    plt.show()


plot_single_anno(utr5_len_1, utr3_len_1, cds_len_1)
plot_single_anno(utr5_len_2, utr3_len_2, cds_len_2)


def compair_anno(Gencode_1, RefSeq_2, title):
    import numpy as np
    import seaborn as sns
    dataSet = []
    for gene_name in Gencode_1:
        if gene_name in RefSeq_2:
            dataSet.append( [np.median(Gencode_1[gene_name]), "Gencode"] )
            dataSet.append( [np.median(RefSeq_2[gene_name]), "RefSeq"] )
    dataSet = pd.DataFrame(dataSet, columns=['length', 'type'])
    # show some statistics
    print "median of Gencode: ", dataSet.loc[dataSet.type=="Gencode", "length"].median()
    print "median of RefSeq: ", dataSet.loc[dataSet.type=="RefSeq", "length"].median()
    # plot
    sns.violinplot(data=dataSet, x='type', y='length')
    plt.xlabel(title, fontsize="large")
    plt.ylabel("Length Distribution", fontsize="large")
    plt.ylim(-500,2000)
    plt.show()

compair_anno(utr5_len_1, utr5_len_2, "5UTR")
compair_anno(utr3_len_1, utr3_len_2, "3UTR")
compair_anno(cds_len_1, cds_len_2, "CDS")