# Makedoc for loading the GENCODE V4 tracks (wgEncodeGencode*V4) into hg19.
# This is a chronological log of the commands that were run; lines starting
# with "# Read", "# Reading", "# NNN groups" etc. are captured tool output.

# downloaded from ftp://ftp.sanger.ac.uk/pub/gencode/gencode.4.DCC.tgz
# on 2010-05-07
cd /hive/groups/encode/dcc/data/gencodeV4

# create tables from hgsql hg19
# CREATE TABLE ... LIKE clones the column definitions *and* the indexes of
# the V3 tables; the earlier "create table ... select * ... limit 0" form
# copies the columns only, which leaves tables that are later loaded with
# -oldTable (wgEncodeGencodePolyaV4) without any index.
hgsql hg19 <<'EOF'
create table wgEncodeGencodeAutoV4 like wgEncodeGencodeAutoV3;
create table wgEncodeGencodeClassesV4 like wgEncodeGencodeClassesV3;
create table wgEncodeGencodeManualV4 like wgEncodeGencodeManualV3;
create table wgEncodeGencodePolyaV4 like wgEncodeGencodePolyaV3;
EOF

#build classes table
echo "LOAD DATA LOCAL INFILE 'gencode_v4.annotation.GRCh37.level_1_2.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
echo "LOAD DATA LOCAL INFILE 'gencode_v4.annotation.GRCh37.level_3.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
echo "LOAD DATA LOCAL INFILE 'gencode.v4.tRNAs.GRCh37.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
echo "LOAD DATA LOCAL INFILE 'gencode.v4.polyAs.GRCh37.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
echo "LOAD DATA LOCAL INFILE 'gencode.v3.2wayconspseudos.GRCh37.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19

# load the manual (level 1+2) annotation
ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeManualV4 \
    gencode_v4.annotation.GRCh37.level_1_2.gtf > reload_manual.out 2>&1
# Reading gencode_v4.annotation.GRCh37.level_1_2.gtf
# Read 101466 transcripts in 1322945 lines in 1 files
# 101466 groups 24 seqs 1 sources 6 feature types
# 101466 gene predictions

# load the automatic (level 3) annotation
ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeAutoV4 \
    gencode_v4.annotation.GRCh37.level_3.gtf > reload_auto_level3.out 2>&1
# Read 41171 transcripts in 811166 lines in 1 files
# 41171 groups 25 seqs 1 sources 6 feature types
# 41171 gene predictions

# add the tRNAs to the automatic table (-oldTable: load into the existing table)
ldHgGene -exon=tRNAscan -genePredExt -oldTable hg19 wgEncodeGencodeAutoV4 \
    gencode.v4.tRNAs.GRCh37.gtf > reload_auto_level_tRNAs.out 2>&1
#Reading gencode.v4.tRNAs.GRCh37.gtf
# Read 622 transcripts in 622 lines in 1 files
# 622 groups 24 seqs 1 sources 1 feature types
# 622 gene predictions

# split the polyA GTF by feature type: every line that matches none of the
# three feature names goes into a shared "header" file, which seeds each of
# the per-feature files before the matching lines are appended
grep -v polyA_signal gencode.v4.polyAs.GRCh37.gtf | grep -v pseudo_polyA | grep -v polyA_site > gencode.v4.polyAs.GRCh37.header.gtf
cp gencode.v4.polyAs.GRCh37.header.gtf gencode.v4.polyAs.GRCh37.signal.gtf
cp gencode.v4.polyAs.GRCh37.header.gtf gencode.v4.polyAs.GRCh37.pseudo.gtf
cp gencode.v4.polyAs.GRCh37.header.gtf gencode.v4.polyAs.GRCh37.site.gtf
grep polyA_signal gencode.v4.polyAs.GRCh37.gtf >> gencode.v4.polyAs.GRCh37.signal.gtf
grep pseudo_polyA gencode.v4.polyAs.GRCh37.gtf >> gencode.v4.polyAs.GRCh37.pseudo.gtf
grep polyA_site gencode.v4.polyAs.GRCh37.gtf >> gencode.v4.polyAs.GRCh37.site.gtf

# load the three polyA feature types into the one table; the first call
# truncates load_polyA.out, the following two append to it
ldHgGene -exon=polyA_signal -genePredExt -noncoding -oldTable hg19 wgEncodeGencodePolyaV4 \
    gencode.v4.polyAs.GRCh37.signal.gtf > load_polyA.out 2>&1
ldHgGene -exon=pseudo_polyA -genePredExt -noncoding -oldTable hg19 wgEncodeGencodePolyaV4 \
    gencode.v4.polyAs.GRCh37.pseudo.gtf >> load_polyA.out 2>&1
ldHgGene -exon=polyA_site -genePredExt -noncoding -oldTable hg19 wgEncodeGencodePolyaV4 \
    gencode.v4.polyAs.GRCh37.site.gtf >> load_polyA.out 2>&1

### 2010-07-02 Download update V4 files from FelixK (braney)
cd /hive/groups/encode/dcc/data/gencodeV4
wget "ftp://ftp.sanger.ac.uk/pub/gencode/other_data/gencode_v4fixed.tgz"
# -rw-r--r-- fsk/ensembl 247107660 2010-07-02 08:26:21 gencode_v4fix.annotation.GRCh37.level_3.gtf
# -rw-r--r-- fsk/ensembl 486686715 2010-07-02 08:26:39 gencode_v4fix.annotation.GRCh37.level_1_2.gtf

#### 2010-07-29 trying to get this sorted (braney)
mkdir old
# NOTE(review): the glob also matches "old" itself, so mv is expected to
# complain that it cannot move 'old' into itself; the other entries are
# still moved.
mv * old
wget "ftp://ftp.sanger.ac.uk/pub/gencode/other_data/gencode_v4fixed.tgz"
tar xvfz gencode_v4fixed.tgz
# gencode_v4fix.annotation.GRCh37.level_3.gtf
# gencode_v4fix.annotation.GRCh37.level_1_2.gtf

# reload auto and manual tables
ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeManualV4 gencode_v4fix.annotation.GRCh37.level_1_2.gtf
# Reading gencode_v4fix.annotation.GRCh37.level_1_2.gtf
# Read 101466 transcripts in 1322945 lines in 1 files
# 101466 groups 24 seqs 1 sources 6 feature types
# 101466 gene predictions

ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeAutoV4 gencode_v4fix.annotation.GRCh37.level_3.gtf
# Reading gencode_v4fix.annotation.GRCh37.level_3.gtf
# Read 41171 transcripts in 811166 lines in 1 files
# 41171 groups 25 seqs 1 sources 6 feature types
# 41171 gene predictions

# the tRNA GTF was moved into old/ above; bring a copy back before reloading
cp old/gencode.v4.tRNAs.GRCh37.gtf .
ldHgGene -exon=tRNAscan -genePredExt -oldTable hg19 wgEncodeGencodeAutoV4 gencode.v4.tRNAs.GRCh37.gtf
# Reading gencode.v4.tRNAs.GRCh37.gtf
# Read 622 transcripts in 622 lines in 1 files
# 622 groups 24 seqs 1 sources 1 feature types
# 622 gene predictions

# update the downloads
cat gencode.v4.tRNAs.GRCh37.gtf gencode_v4fix.annotation.GRCh37.level_3.gtf \
    | gzip -c > /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeGencode/wgEncodeGencodeAutoV4.gtf.gz
cat gencode_v4fix.annotation.GRCh37.level_1_2.gtf \
    | gzip -c > /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeGencode/wgEncodeGencodeManualV4.gtf.gz

# get Yale mappings for the pseudogenes
wget "http://homes.gersteinlab.org/people/yhl3/pgenes/gencode/wgEncodeGencodePgenes.v4.Human58.tsv"

#added following entries to the trackDb for wgEncodeGencode
# yalePseudoAssoc wgEncodeGencodeYalePseudoV4
# yaleUrl http://tables.pseudogene.org/
hgLoadSqlTab hg19 wgEncodeGencodeYalePseudoV4 ~/kent/src/hg/lib/yaleGencodeAssoc.sql wgEncodeGencodePgenes.v4.Human58.tsv
cat wgEncodeGencodePgenes.v4.Human58.tsv \
    | gzip -c > /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeGencode/wgEncodeGencodeYalePseudoV4.tsv.gz

# load 2wayconspeudo (the V3 GTF is reused; the table is loaded under its
# unversioned name and then renamed to V4)
mv old/gencode.v3.2wayconspseudos.GRCh37.gtf .
ldHgGene -exon=transcript -noncoding -genePredExt hg19 wgEncodeGencode2wayConsPseudo gencode.v3.2wayconspseudos.GRCh37.gtf
echo "rename table wgEncodeGencode2wayConsPseudo to wgEncodeGencode2wayConsPseudoV4;" | hgsql hg19
cat gencode.v3.2wayconspseudos.GRCh37.gtf \
    | gzip -c > /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeGencode/wgEncodeGencode2wayConsPseudoV4.gtf.gz

# build metadata
# Writes one metadata stanza per V4 table to stdout.
# "Polya" (lower-case a) matches the table actually created and loaded above,
# wgEncodeGencodePolyaV4.
# NOTE(review): no download file is written for Classes or Polya in this log;
# their fileName lines are emitted exactly as before - confirm they are wanted.
for i in YalePseudo 2wayConsPseudo Manual Auto Classes Polya
do
    table="wgEncodeGencode${i}V4"
    # the Yale pseudogene download above is a .tsv.gz, everything else is GTF
    case "$i" in
        YalePseudo) ext=tsv ;;
        *)          ext=gtf ;;
    esac
    echo "metaObject $table"
    echo "objType table"
    echo "annotation Automatic"
    echo "composite wgEncodeGencode"
    echo "dataType Gencode"
    echo "dataVersion ENCODE May 2010 Freeze"
    echo "dateSubmitted 2010-05-01"
    echo "dateUnrestricted 2011-02-01"
    echo "fileName ${table}.${ext}.gz"
    echo "grant Hubbard"
    echo "lab Sanger"
    echo "level 3"
    echo "project wgEncode"
    echo "submittedDataVersion V4"
    echo "tableName $table"
    echo
done

# reloading the "right" way so that the gene_name in the GTF ends up in name2
# of the genePred
# (the sed replaces the first "ENSTR" on each line with "ENST0")
# ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeManualV4 gencode_v4fix.annotation.GRCh37.level_1_2.gtf
gtfToGenePred -genePredExt -geneNameAsName2 gencode_v4fix.annotation.GRCh37.level_1_2.gtf stdout \
    | sed 's/ENSTR/ENST0/' > gencode_v4fix.annotation.GRCh37.level_1_2.gp
hgLoadGenePred -genePredExt hg19 wgEncodeGencodeManualV4 gencode_v4fix.annotation.GRCh37.level_1_2.gp

# relabel "transcript" features (column 3) as "exon" before converting
# ldHgGene -exon=transcript -noncoding -genePredExt hg19 wgEncodeGencode2wayConsPseudo gencode.v3.2wayconspseudos.GRCh37.gtf
tawk '$3=="transcript"{$3="exon"}{print $0}' gencode.v3.2wayconspseudos.GRCh37.gtf \
    | gtfToGenePred -genePredExt stdin gencode.v3.2wayconspseudos.GRCh37.gp
hgLoadGenePred -genePredExt hg19 wgEncodeGencode2wayConsPseudoV4 gencode.v3.2wayconspseudos.GRCh37.gp

# level 3 and tRNA genePreds are concatenated and loaded into the Auto table
# in a single hgLoadGenePred call
# ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeAutoV4 gencode_v4fix.annotation.GRCh37.level_3.gtf
# ldHgGene -exon=tRNAscan -genePredExt -oldTable hg19 wgEncodeGencodeAutoV4 gencode.v4.tRNAs.GRCh37.gtf
gtfToGenePred -genePredExt -geneNameAsName2 gencode_v4fix.annotation.GRCh37.level_3.gtf stdout \
    | sed 's/ENSTR/ENST0/' > gencode_v4fix.annotation.GRCh37.level_3.gp
tawk '$3=="tRNAscan"{$3="exon"}{print $0}' gencode.v4.tRNAs.GRCh37.gtf \
    | gtfToGenePred -genePredExt stdin gencode.v4.tRNAs.GRCh37.gp
cat gencode_v4fix.annotation.GRCh37.level_3.gp gencode.v4.tRNAs.GRCh37.gp \
    | hgLoadGenePred -genePredExt hg19 wgEncodeGencodeAutoV4 stdin

# ldHgGene -exon=polyA_signal -genePredExt -noncoding -oldTable hg19 wgEncodeGencodePolyaV4 gencode.v4.polyAs.GRCh37.signal.gtf > load_polyA.out 2>&1
# ldHgGene -exon=pseudo_polyA -genePredExt -noncoding -oldTable hg19 wgEncodeGencodePolyaV4 gencode.v4.polyAs.GRCh37.pseudo.gtf >> load_polyA.out 2>&1
# ldHgGene -exon=polyA_site -genePredExt -noncoding -oldTable hg19 wgEncodeGencodePolyaV4 gencode.v4.polyAs.GRCh37.site.gtf >> load_polyA.out 2>&1
#tawk '$3=="polyA_signal"{$3="exon"}{print $0}' gencode.v4.polyAs.GRCh37.signal.gtf | gtfToGenePred -genePredExt stdin gencode.v4.polyAs.GRCh37.signal.gp
# didn't reload PolyA since names are garbage and in V3 all name = name2

#build classes table
# echo "LOAD DATA LOCAL INFILE 'gencode_v4.annotation.GRCh37.level_1_2.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
# echo "LOAD DATA LOCAL INFILE 'gencode_v4.annotation.GRCh37.level_3.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
# echo "LOAD DATA LOCAL INFILE 'gencode.v4.tRNAs.GRCh37.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
# echo "LOAD DATA LOCAL INFILE 'gencode.v4.polyAs.GRCh37.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
# echo "LOAD DATA LOCAL INFILE 'gencode.v3.2wayconspseudos.GRCh37.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
# NOTE(review): the *.classes files were moved into old/ by the 2010-07-29
# "mv * old" and this log never restores them - copy them back from old/
# first if they are not present in the working directory.
cat gencode_v4.annotation.GRCh37.level_1_2.classes \
    gencode_v4.annotation.GRCh37.level_3.classes \
    gencode.v4.tRNAs.GRCh37.classes \
    gencode.v4.polyAs.GRCh37.classes \
    gencode.v3.2wayconspseudos.GRCh37.classes \
    | hgLoadSqlTab hg19 wgEncodeGencodeClassesV4 ~/kent/src/hg/lib/gencodeGeneClass.sql stdin