2011-12-28: import of GENCODE V10 (markd) # Due to UCSC Genome Browser using the NC_001807 mitochondrial genome sequence # (chrM) and GENCODE annotating the NC_012920 mitochondrial sequence, the # GENCODE mitochondrial sequences are not loaded # download files mkdir -p /hive/groups/encode/dcc/data/gencodeV10/release cd /hive/groups/encode/dcc/data/gencodeV10/release wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.2wayconspseudos.gtf.gz wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.annotation.gtf.gz wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.long_noncoding_RNAs.gtf.gz wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.pc_transcripts.fa.gz wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.pc_translations.fa.gz wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.polyAs.gtf.gz wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.tRNAs.gtf.gz wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode10_GRCh37.tgz # NOTE: this was missing, was in V9, ask Sanger wget -nv ftp://ftp.sanger.ac.uk/pub/gencode/release_10/gencode.v10.long_noncoding_RNAs.fa.gz # silly sanity check: for f in * ; do zcat $f >/dev/null ; done # untar main distribution tar -zxf gencode10_GRCh37.tgz # create Makefile from previous one cd /hive/groups/encode/dcc/data/gencodeV10/ cp ../gencodeV9/Makefile . 
# edit as needed # this build depends on code in the CCDS subversion tree: # svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/svnroot/hausslerlab/ccds/trunk # and markd's python library (it will be moved to the hausslerlab # repository soon) # may need to update ccds2/modules/gencode/src/lib/gencode/data/gencodeGenes.py # to add new biotypes; use this command to verify and update as needed make checkAttrs (time nice make -j 10) >&build.out& # took ~45 minutes, log below ### IMPORTANT: make sure that hgTracks/simpleTracks.c registers a ### track handler for this version of gencode: registerTrackHandlerOnFamily("wgEncodeGencodeV10", gencodeGeneMethods); ------------------------------------------------------------------------------ cat release/gencode_release_10/gencode.v10.annotation.level_1_2.gtf release/gencode_release_10/gencode.v10.annotation.level_3.gtf | gencodeGtfToGenePred /dev/stdin data/gencodeManAuto.gp.hgwdev.22170.tmp cat release/gencode_release_10/gencode.v10.annotation.level_1_2.gtf release/gencode_release_10/gencode.v10.annotation.level_3.gtf | gencodeGtfToAttrs /dev/stdin data/gencodeManAuto.tsv.hgwdev.22170.tmp mkdir -p tables/ mkdir -p tables/ mkdir -p tables/ gencodePolyaGtfToGenePred release/gencode_release_10/gencode.v10.polyAs.gtf tables/wgEncodeGencodePolyaV10.gp.hgwdev.22170.tmp tawk '$3=="transcript"{$3 = "exon"} {print $0}' release/gencode_release_10/gencode.v10.2wayconspseudos.gtf | gtfToGenePred stdin tables/wgEncodeGencode2wayConsPseudoV10.gp.hgwdev.22170.tmp cp release/gencode_release_10/metadata/gencode.v10.metadata.Gene_source tables/wgEncodeGencodeGeneSourceV10.tab.hgwdev.22170.tmp cp release/gencode_release_10/metadata/gencode.v10.metadata.Transcript_source tables/wgEncodeGencodeTranscriptSourceV10.tab.hgwdev.22170.tmp cp release/gencode_release_10/metadata/gencode.v10.metadata.Transcript_supporting_feature tables/wgEncodeGencodeTranscriptSupportV10.tab.hgwdev.22170.tmp tawk '{split($5,coord,":|-"); print $1,$2,$3,$4,coord[1],coord[2]-1,coord[3]}' 
release/gencode_release_10/metadata/gencode.v10.metadata.Exon_supporting_feature | sort -k 1,1 -k 2,2 -k 5,5 -k 6,6n > tables/wgEncodeGencodeExonSupportV10.tab.hgwdev.22170.tmp mkdir -p tables/ cp release/gencode_release_10/metadata/gencode.v10.metadata.PDB tables/wgEncodeGencodePdbV10.tab.hgwdev.22170.tmp mkdir -p tables/ cp release/gencode_release_10/metadata/gencode.v10.metadata.Pubmed_id tables/wgEncodeGencodePubMedV10.tab.hgwdev.22170.tmp mv -f tables/wgEncodeGencodePdbV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodePdbV10.tab mkdir -p tables/ cp release/gencode_release_10/metadata/gencode.v10.metadata.RefSeq tables/wgEncodeGencodeRefSeqV10.tab.hgwdev.22170.tmp mv -f tables/wgEncodeGencodeGeneSourceV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeGeneSourceV10.tab (tawk '{print $0,"SwissProt"}' release/gencode_release_10/metadata/gencode.v10.metadata.SwissProt && tawk '{print $0,"TrEMBL"}' release/gencode_release_10/metadata/gencode.v10.metadata.TrEMBL) | sort -k 1,1 > tables/wgEncodeGencodeUniProtV10.tab.hgwdev.22170.tmp mv -f tables/wgEncodeGencodeRefSeqV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeRefSeqV10.tab tawk '{print $1,gensub("\\\\n|\\\\","","g",$2)}' release/gencode_release_10/metadata/gencode.v10.metadata.Annotation_remark | sort -k 1,1 > tables/wgEncodeGencodeAnnotationRemarkV10.tab.hgwdev.22170.tmp mv -f tables/wgEncodeGencodeTranscriptSupportV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeTranscriptSupportV10.tab mkdir -p loaded/ flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeGeneSourceV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeGeneSource.sql tables/wgEncodeGencodeGeneSourceV10.tab Scanning through 1 files mv -f tables/wgEncodeGencode2wayConsPseudoV10.gp.hgwdev.22170.tmp tables/wgEncodeGencode2wayConsPseudoV10.gp mkdir -p loaded/ flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeTranscriptSupportV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTranscriptSupport.sql 
tables/wgEncodeGencodeTranscriptSupportV10.tab mv -f tables/wgEncodeGencodeTranscriptSourceV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeTranscriptSourceV10.tab mkdir -p loaded/ flock load.lock hgLoadSqlTab hg19 wgEncodeGencodePdbV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodePdb.sql tables/wgEncodeGencodePdbV10.tab touch loaded/wgEncodeGencodeGeneSourceV10.tab.loaded mkdir -p loaded/ flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeRefSeqV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeRefSeq.sql tables/wgEncodeGencodeRefSeqV10.tab Scanning through 1 files mv -f tables/wgEncodeGencodePubMedV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodePubMedV10.tab flock load.lock hgLoadGenePred hg19 wgEncodeGencode2wayConsPseudoV10 tables/wgEncodeGencode2wayConsPseudoV10.gp touch loaded/wgEncodeGencodeTranscriptSupportV10.tab.loaded mkdir -p loaded/ flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeTranscriptSourceV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTranscriptSource.sql tables/wgEncodeGencodeTranscriptSourceV10.tab Scanning through 1 files mv -f tables/wgEncodeGencodeAnnotationRemarkV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeAnnotationRemarkV10.tab mkdir -p loaded/ flock load.lock hgLoadSqlTab hg19 wgEncodeGencodePubMedV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodePubMed.sql tables/wgEncodeGencodePubMedV10.tab touch loaded/wgEncodeGencodeRefSeqV10.tab.loaded mkdir -p loaded/ flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeAnnotationRemarkV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeAnnotationRemark.sql tables/wgEncodeGencodeAnnotationRemarkV10.tab Scanning through 1 files touch loaded/wgEncodeGencodeTranscriptSourceV10.tab.loaded Scanning through 1 files touch loaded/wgEncodeGencodePubMedV10.tab.loaded Scanning through 1 files touch 
loaded/wgEncodeGencodeAnnotationRemarkV10.tab.loaded Scanning through 1 files mv -f tables/wgEncodeGencodeUniProtV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeUniProtV10.tab mkdir -p loaded/ flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeUniProtV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeUniProt.sql tables/wgEncodeGencodeUniProtV10.tab touch loaded/wgEncodeGencodePdbV10.tab.loaded touch loaded/wgEncodeGencode2wayConsPseudoV10.genePred.loaded Scanning through 1 files touch loaded/wgEncodeGencodeUniProtV10.tab.loaded mv -f tables/wgEncodeGencodePolyaV10.gp.hgwdev.22170.tmp tables/wgEncodeGencodePolyaV10.gp flock load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodePolyaV10 tables/wgEncodeGencodePolyaV10.gp touch loaded/wgEncodeGencodePolyaV10.genePredExt.loaded mv -f data/gencodeManAuto.gp.hgwdev.22170.tmp data/gencodeManAuto.gp mv -f data/gencodeManAuto.tsv.hgwdev.22170.tmp data/gencodeManAuto.tsv gencodeMakeTracks --excludeChrom=chrM $(echo Basic | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeBasicV10.gp.hgwdev.22170.tmp gencodeMakeTracks --excludeChrom=chrM $(echo Comp | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeCompV10.gp.hgwdev.22170.tmp gencodeMakeTracks --excludeChrom=chrM $(echo PseudoGene | tr A-Z a-z) data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodePseudoGeneV10.gp.hgwdev.22170.tmp gencodeMakeAttrs --excludeChrom=chrM data/gencodeManAuto.gp data/gencodeManAuto.tsv tables/wgEncodeGencodeAttrsV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeTagV10.tab mv -f tables/wgEncodeGencodePseudoGeneV10.gp.hgwdev.22170.tmp tables/wgEncodeGencodePseudoGeneV10.gp flock load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodePseudoGeneV10 tables/wgEncodeGencodePseudoGeneV10.gp touch loaded/wgEncodeGencodePseudoGeneV10.genePredExt.loaded mv -f tables/wgEncodeGencodeCompV10.gp.hgwdev.22170.tmp tables/wgEncodeGencodeCompV10.gp flock 
load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodeCompV10 tables/wgEncodeGencodeCompV10.gp mv -f tables/wgEncodeGencodeBasicV10.gp.hgwdev.22170.tmp tables/wgEncodeGencodeBasicV10.gp flock load.lock hgLoadGenePred -genePredExt hg19 wgEncodeGencodeBasicV10 tables/wgEncodeGencodeBasicV10.gp mv -f tables/wgEncodeGencodeAttrsV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeAttrsV10.tab mkdir -p loaded/ mkdir -p loaded/ flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeAttrsV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeAttrs.sql tables/wgEncodeGencodeAttrsV10.tab flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeTagV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeTag.sql tables/wgEncodeGencodeTagV10.tab touch loaded/wgEncodeGencodeCompV10.genePredExt.loaded touch loaded/wgEncodeGencodeBasicV10.genePredExt.loaded Scanning through 1 files touch loaded/wgEncodeGencodeAttrsV10.tab.loaded mkdir -p check/ mkdir -p check/ hgsql -Ne 'select geneId from wgEncodeGencodeAttrsV10 where geneId not in (select geneId from wgEncodeGencodeGeneSourceV10)' hg19 | sort -u >check/wgEncodeGencodeGeneSourceV10.missing hgsql -Ne 'select transcriptId from wgEncodeGencodeAttrsV10 where transcriptId not in (select transcriptId from wgEncodeGencodeTranscriptSourceV10)' hg19 | sort -u >check/wgEncodeGencodeTranscriptSourceV10.missing Scanning through 1 files touch loaded/wgEncodeGencodeTagV10.tab.loaded touch check/wgEncodeGencodeGeneSourceV10.checked touch check/wgEncodeGencodeTranscriptSourceV10.checked mv -f tables/wgEncodeGencodeExonSupportV10.tab.hgwdev.22170.tmp tables/wgEncodeGencodeExonSupportV10.tab mkdir -p loaded/ flock load.lock hgLoadSqlTab hg19 wgEncodeGencodeExonSupportV10 /cluster/home/markd/compbio/browser/gencode/kent/src/hg/lib/encode/wgEncodeGencodeExonSupport.sql tables/wgEncodeGencodeExonSupportV10.tab Scanning through 1 files touch loaded/wgEncodeGencodeExonSupportV10.tab.loaded real 46m55.757s 
user 48m56.944s sys 0m33.582s ------------------------------------------------------------------------------