### gencode.txt wrangler notes for all submissions for "ENCODE Gencode" select id,name,status from projects where name like "Gencode%"; +-----+---------------------+-----------+ | id | name | status | +-----+---------------------+-----------+ | 21 | GENCODE_08_10_01 | displayed | | 265 | Gencode_2009_01_01 | displayed | +-----+---------------------+-----------+ 2 rows in set (0.00 sec) Sanger Gencode ============== initial encode Gencode Genes desc encodeGencodeGeneClassMar07; +-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----+------------------+-------+ | Field | Type | Null | Key | Default | Extra | +-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----+------------------+-------+ | name | varchar(255) | | PRI | | | | class | enum('Novel_transcript','Novel_transcript_gencode_conf','Artifact','Known','Novel_CDS', 'Novel_transcript','Novel_transcript_gencode_conf','Putative','Putative_gencode_conf','TEC', 'Processed_pseudogene','Unprocessed_pseudogene','Polymorphic') | | | Novel_transcript | | mysql> select * from encodeGencodeGeneClassMar07 limit 2; +----------------+----------------------+ | name | class | +----------------+----------------------+ | AC000059.1-001 | Processed_pseudogene | | AC000061.1-001 | Known | +----------------+----------------------+ WG encode Gencode Genes select * from wgEncodeSangerGencodeGencodeAuto20081001 limit 2; 
+-----+-----------------+-------+--------+---------+--------+----------+--------+-----------+--------------------------------------------+--------------------------------------------+-------+-----------------+--------------+------------+--------------+ | bin | name | chrom | strand | txStart | txEnd | cdsStart | cdsEnd | exonCount | exonStarts | exonEnds | score | name2 | cdsStartStat | cdsEndStat | exonFrames | +-----+-----------------+-------+--------+---------+--------+----------+--------+-----------+--------------------------------------------+--------------------------------------------+-------+-----------------+--------------+------------+--------------+ | 585 | ENST00000382784 | chr11 | - | 117925 | 119388 | 117925 | 119388 | 2 | 117925,119059, | 118376,119388, | 0 | ENSG00000206082 | cmpl | cmpl | 2,0, | | 585 | ENST00000382782 | chr11 | + | 118170 | 119275 | 118170 | 119275 | 6 | 118170,118244,119031,119092,119174,119246, | 118242,118349,119088,119170,119235,119275, | 0 | ENSG00000206080 | incmpl | incmpl | 0,0,0,0,0,1, | +-----+-----------------+-------+--------+---------+--------+----------+--------+-----------+--------------------------------------------+--------------------------------------------+-------+-----------------+--------------+------------+--------------+ cd /usr/local/apache/htdocs/goldenPath/hg18/wgEncodeSangerGencode zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep ENST00000382784 chr11 ENSEMBL exon 119060 119388 . - . gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3; chr11 ENSEMBL CDS 119060 119388 . - 0 gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3; chr11 ENSEMBL start_codon 119386 119388 . 
- 0 gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3; chr11 ENSEMBL exon 117926 118376 . - . gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3; chr11 ENSEMBL CDS 117929 118376 . - 1 gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3; chr11 ENSEMBL stop_codon 117926 117928 . - 0 gene_id "ENSG00000206082"; transcript_id "ENST00000382784"; transcript_type "protein_coding"; transcript_status "NOVEL"; gene_type "protein_coding"; gene_status "NOVEL"; level 3; Uniformity of files? zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | wc -l 259464 [hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "transcript_type" | wc -l 259459 [hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "transcript_status" | wc -l 259459 [hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "gene_type" | wc -l 259459 [hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "gene_status" | wc -l 259459 [hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "gene_id" | wc -l 259459 [hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "transcript_id" | wc -l 259459 zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep gene_name | wc -l 249356 [hgwdev:tdreszer wgEncodeSangerGencode> zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep transcript_name | wc -l 243875 wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz 259464 lines, 5 without 
any of the 6 gene_id,gene_type,gene_status,transcript_id,transcript_type,transcript_status, 249356 lines with gene_name 243875 lines with transcript_name wgEncodeSangerGencodeGencodeManual20081001.gtf.gz 734750 lines, 5 without any of the 8 gene_id,gene_type,gene_status,gene_name,transcript_id,transcript_type,transcript_status,transcript_name Missing 5 due to header: ##description: evidence-based annotation of the human genome (NCBI36) ##provider: GENCODE ##contact: fsk@sanger.ac.uk ##format: gtf 2.2 ##date: 2008-10-02 What do we need? A) gene_name or else transcript_name or else gene_id or else transcript_id in NAME2? B) itemClassTable (like encodeGencodeGeneClassMar07) to tie gencode_genes.NAME to itemClassTable.name and class as ? 'Known', gene_status else transcript_status = "KNOWN" 'Novel_transcript', gene_status else transcript_status = "NOVEL" 'Novel_transcript_gencode_conf', 'Artifact', 'Novel_CDS', 'Putative', ??? No "putative" in submission files 'Putative_gencode_conf', ??? 'TEC', gene_type else transcript_type = "TEC" 'Processed_pseudogene', gene_type else transcript_type = "pseudogene" !!! There are qualifiers, e.g. "scRNA_pseudogene" 'Unprocessed_pseudogene', gene_type else transcript_type = "unprocessed_pseudogene" 'Polymorphic' gene_type = "polymorphic_pseudogene" transcript Streamline the file first: #1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 chr1 HAVANA exon 13087744 13088030 . - . 
gene_id "OTTHUMG00000009501"; transcript_id "OTTHUMT00000026267"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_status "UNKNOWN"; gene_type "polymorphic_pseudogene"; gene_status "NOVEL"; gene_name "RP13-221M14.3"; transcript_name "RP13-221M14.3-001"; level 2; # h,e,loc,str,gi,ti,gn,tn,gt,tt,gs,ts,lvl zcat wgEncodeSangerGencodeGencodeManual20081001.gtf.gz | sed "s/\"//g" | sed "s/;//g" | awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") printf "%s\t%s\t%s:%d-%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\n",$2,$3,$1,$4,$5,$7,$10,$12,$22,$24,$18,$14,$20,$16,$26;}' > wgEncodeSangerGencodeGencodeManual20081001.stream.tab wc -l wgEncodeSangerGencodeGencodeManual20081001.stream.tab: 734745 uniq -f 4 wgEncodeSangerGencodeGencodeManual20081001.stream.tab > wgEncodeSangerGencodeGencodeManual20081001.uniq.tab wc -l wgEncodeSangerGencodeGencodeManual20081001.uniq.tab 67432 select count(*) from wgEncodeSangerGencodeGencodeManual20081001; 67432 === OR: === zcat wgEncodeSangerGencodeGencodeManual20081001.gtf.gz | sed "s/\"//g" | sed "s/;//g" | awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\n",$10,$12,$22,$24,$18,$14,$20,$16,$26;}' | uniq > wgEncodeSangerGencodeGencodeManual20081001.stream.tab * * * * * * Manual is uniform, but auto has exceptions: zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep -v gene_name | grep -v transcript_name | wc -l 10108 zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep -v gene_name | grep transcript_name | wc -l 0 zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep gene_name | grep -v transcript_name | wc -l 5481 * * * * * * Load uniq classes tables zcat 
wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | grep "level 3" | sed "s/\"//g" | sed "s/;//g" | awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t3\n",$10,$12,$22,$24,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t3\n", $10,$12,$22,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t3\n", $10,$12,$18,$14,$20,$16;}' > ggL3.classes.tab wc -l ggL3.classes.tab 259459 zcat wgEncodeSangerGencodeGencodeAuto20081001.gtf.gz | wc -l 259464 uniq ggL3.classes.tab > ggL3.uniq.tab wc -l ggL3.uniq.tab 16293 zcat wgEncodeSangerGencodeGencodeManual20081001.gtf.gz | grep "level 1" | sed "s/\"//g" | sed "s/;//g" | awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t1\n",$10,$12,$22,$24,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t1\n", $10,$12,$22,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t1\n", $10,$12,$18,$14,$20,$16;}' > ggL1.classes.tab zcat wgEncodeSangerGencodeGencodeManual20081001.gtf.gz | grep "level 2" | sed 
"s/\"//g" | sed "s/;//g" | awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t2\n",$10,$12,$22,$24,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t2\n", $10,$12,$22,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t2\n", $10,$12,$18,$14,$20,$16;}' > ggL2.classes.tab zcat wgEncodeSangerGencodeGencodeManual20081001.gtf.gz | wc -l 734750 wc -l ggL1.classes.tab 4563 wc -l ggL2.classes.tab 730182 730182 + 4563 + 5 = 734750 uniq ggL1.classes.tab > ggL1.uniq.tab uniq ggL2.classes.tab > ggL2.uniq.tab wc -l ggL1.uniq.tab 3638 wc -l ggL2.uniq.tab 63794 63794 + 3638 = 67432 cp ggL1.uniq.tab ggMan.uniq.tab cat ggL2.uniq.tab >> ggMan.uniq.tab >ENSEMBL exon chr11:566486-566592 + ENSG00000070047 ENST00000264555 K1542_HUMAN K1542_HUMAN protein_coding protein_coding KNOWN KNOWN 3 Plan: A) load uniq.tabs into hg18 temporarily. 
echo "CREATE TABLE wgEncodeSangerGencodeGeneClasses_tmp ( geneId varchar(255) not null, transcriptId varchar(255) not null, geneName varchar(255) not null, transcriptName varchar(255) not null, geneType varchar(255) not null, transcriptType varchar(255) not null, geneStatus varchar(255) not null, transcriptStatus varchar(255) not null, level integer, class enum('Undefined','Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene','Havana_coding','Havana_nonsense','Havana_non_coding','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC','Havana_polyA','Ensembl_coding','Ensembl_RNA','Ensembl_pseudogene') not null default 'Undefined', INDEX(transcriptName), INDEX(class,level,transcriptType), PRIMARY KEY(transcriptId));" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'ggL3.uniq.tab' into table wgEncodeSangerGencodeGeneClasses_tmp" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'ggMan.uniq.tab' into table wgEncodeSangerGencodeGeneClasses_tmp" | hgsql hg18 select count(*) from wgEncodeSangerGencodeGencodeAuto20081001; | 16293 | select count(*) from wgEncodeSangerGencodeGencodeManual20081001; | 67432 | select count(*) from wgEncodeSangerGencodeGeneClasses_tmp; | 83725 | B) Update the 2 gpf tracks. Currently NAME = transcriptId and NAME2 = gene_id; What do we want? gn > tn > gi > ti ? 
update wgEncodeSangerGencodeGencodeAuto20081001, wgEncodeSangerGencodeGeneClasses_tmp set name2 = wgEncodeSangerGencodeGeneClasses_tmp.gene_name where wgEncodeSangerGencodeGencodeAuto20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId and wgEncodeSangerGencodeGeneClasses_tmp.gene_name != ""; update wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp set name2 = wgEncodeSangerGencodeGeneClasses_tmp.gene_name where wgEncodeSangerGencodeGencodeManual20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; select count(*) from wgEncodeSangerGencodeGencodeManual20081001; 67432 select count(*) from wgEncodeSangerGencodeGeneClasses_tmp,wgEncodeSangerGencodeGencodeManual20081001 where wgEncodeSangerGencodeGencodeManual20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; 67432 update wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp set wgEncodeSangerGencodeGencodeManual20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId where wgEncodeSangerGencodeGencodeManual20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; update wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp set wgEncodeSangerGencodeGencodeManual20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcript_name where wgEncodeSangerGencodeGencodeManual20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; select count(*) from wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp where wgEncodeSangerGencodeGencodeManual20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; 67432 select count(*) from wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp where wgEncodeSangerGencodeGencodeManual20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcript_name and wgEncodeSangerGencodeGencodeManual20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; 67432 select count(*) 
from wgEncodeSangerGencodeGencodeManual20081001 where name = ""; 0 select count(*) from wgEncodeSangerGencodeGencodeAuto20081001; 16293 select count(*) from tmpGencodeGeneAuto_tab,wgEncodeSangerGencodeGencodeAuto20081001 where wgEncodeSangerGencodeGencodeAuto20081001.name = tmpGencodeGeneAuto_tab.transcriptId; 16293 update wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp set wgEncodeSangerGencodeGencodeAuto20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId where wgEncodeSangerGencodeGencodeAuto20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; update wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp set wgEncodeSangerGencodeGencodeAuto20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcript_name where wgEncodeSangerGencodeGencodeAuto20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId and wgEncodeSangerGencodeGeneClasses_tmp.transcript_name != ""; select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where wgEncodeSangerGencodeGencodeAuto20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; 16293 select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where wgEncodeSangerGencodeGencodeAuto20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcript_name and wgEncodeSangerGencodeGencodeAuto20081001.name2 = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; 11752 select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where wgEncodeSangerGencodeGencodeAuto20081001.name = wgEncodeSangerGencodeGeneClasses_tmp.transcriptId; 4541 Try again: want name=transcriptId, name2=transcriptName Currently NAME = transcriptId and NAME2 = gene_id; What do we want? gn > tn > gi > ti ? 
select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where name2 = transcriptId; 16293 update wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp set name = transcriptId where name2 = transcriptId; update wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp set name2 = transcriptName where name2 = transcriptId and transcriptName != ""; select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where name = transcriptId; 16293 select count(*) from wgEncodeSangerGencodeGencodeAuto20081001,wgEncodeSangerGencodeGeneClasses_tmp where name = transcriptId and name2 != transcriptName; 4541 select count(*) from wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp where name2 = transcriptId; 67432 update wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp set name = transcriptId where name2 = transcriptId; update wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp set name2 = transcriptName where name2 = transcriptId and transcriptName != ""; select count(*) from wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp where name = transcriptId; 67432 select count(*) from wgEncodeSangerGencodeGencodeManual20081001,wgEncodeSangerGencodeGeneClasses_tmp where name = transcriptId and name2 != transcriptName; 0 C) Class is very tricky Name Color Contents ---- ----- -------- Validated_coding Dark Yellow protein_coding/level_1 Validated_processed Light Yellow processed_transcript/level_1 Validated_processed_pseudogene Dark Purple processed_pseudogene/level_1, transcribed_processed_pseudogene/level_1 Validated_unprocessed_pseudogene Medium Purple unitary_pseudogene/level_1, unprocessed_pseudogene/level_1 Validated_pseudogene Light Purple IG_pseudogene/level_1, pseudogene/level_1, transcribed_pseudogene/level_1 Havana_coding Dark Orange 
protein_coding/level_2, IG_gene/level_2 Havana_nonsense Medium Orange nonsense_mediated_decay/level_2 Havana_non_coding Light Orange ambiguous_orf/level_2, antisense/level_2, non_coding/level_2, retained_intron/level_2, processed_transcript/level_2 Havana_processed_pseudogene Dark Pink processed_pseudogene/level_2, transcribed_processed_pseudogene/level_2, transcribed_pseudogene/level_2 Havana_unprocessed_pseudogene Medium Pink polymorphic_pseudogene/level_2, transcribed_unprocessed_pseudogene/level_2, unitary_pseudogene/level_2, unprocessed_pseudogene/level_2 Havana_pseudogene Light Pink pseudogene/level_2, IG_pseudogene/level_2 Havana_TEC Grey TEC/level_2, artifact/level_2 Havana_polyA Black polyA features Ensembl_coding Dark Red protein_coding/level_3, IG_protein/level_3, C_segment/level_3, J_segment/level_3, V_segment/level_3 Ensembl_RNA Light Red Mt_tRNA_pseudogene/level_3, miRNA/level_3, miRNA_pseudogene/level_3, misc_RNA/level_3, misc_RNA_pseudogene/level_3, rRNA/level_3, rRNA_pseudogene/level_3, scRNA/level_3, scRNA_pseudogene/level_3, snRNA/level_3, snRNA_pseudogene/level_3, snoRNA/level_3, snoRNA_pseudogene/level_3, tRNA_pseudogene/level_3 Ensembl_pseudogene Dark Pink pseudogene/level_3, retrotransposed/level_3 select distinct level,transcriptType from wgEncodeSangerGencodeGeneClasses_tmp order by level,transcriptType; +-------+------------------------------------+ | level | transcriptType | +-------+------------------------------------+ | 1 | IG_pseudogene | Validated_pseudogene | 1 | processed_pseudogene | Validated_processed_pseudogene | 1 | pseudogene | Validated_pseudogene | 1 | transcribed_processed_pseudogene | Validated_processed_pseudogene | 1 | transcribed_pseudogene | Validated_pseudogene | 1 | unitary_pseudogene | Validated_unprocessed_pseudogene | 1 | unprocessed_pseudogene | Validated_unprocessed_pseudogene | 2 | ambiguous_orf | Havana_non_coding | 2 | antisense | Havana_non_coding | 2 | artifact | Havana_TEC | 2 | IG_gene | Havana_coding | 
2 | IG_pseudogene | Havana_pseudogene | 2 | nonsense_mediated_decay | Havana_nonsense | 2 | non_coding | Havana_non_coding | 2 | polymorphic_pseudogene | Havana_unprocessed_pseudogene | 2 | processed_pseudogene | Havana_processed_pseudogene | 2 | processed_transcript | Havana_non_coding | 2 | protein_coding | Havana_coding | 2 | pseudogene | Havana_pseudogene | 2 | retained_intron | Havana_non_coding | 2 | TEC | Havana_TEC | 2 | transcribed_processed_pseudogene | Havana_processed_pseudogene | 2 | transcribed_pseudogene | Havana_processed_pseudogene | 2 | transcribed_unprocessed_pseudogene | Havana_unprocessed_pseudogene | 2 | unitary_pseudogene | Havana_unprocessed_pseudogene | 2 | unprocessed_pseudogene | Havana_unprocessed_pseudogene | 3 | C_segment | Ensembl_coding | 3 | J_segment | Ensembl_coding | 3 | miRNA | Ensembl_RNA | 3 | miRNA_pseudogene | Ensembl_RNA | 3 | misc_RNA | Ensembl_RNA | 3 | misc_RNA_pseudogene | Ensembl_RNA | 3 | Mt_tRNA_pseudogene | Ensembl_RNA | 3 | protein_coding | Ensembl_coding | 3 | pseudogene | Ensembl_pseudogene | 3 | retrotransposed | Ensembl_pseudogene | 3 | rRNA | Ensembl_RNA | 3 | rRNA_pseudogene | Ensembl_RNA | 3 | scRNA | Ensembl_RNA | 3 | scRNA_pseudogene | Ensembl_RNA | 3 | snoRNA | Ensembl_RNA | 3 | snoRNA_pseudogene | Ensembl_RNA | 3 | snRNA | Ensembl_RNA | 3 | snRNA_pseudogene | Ensembl_RNA | 3 | tRNA_pseudogene | Ensembl_RNA | 3 | V_segment | Ensembl_coding +-------+------------------------------------+ 46 rows in set (0.34 sec) alter table wgEncodeSangerGencodeGeneClasses_tmp add column class enum('Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene','Havana_coding','Havana_nonsense','Havana_non_coding','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC','Havana_polyA','Ensembl_coding','Ensembl_RNA','Ensembl_pseudogene','Undefined') not null; update wgEncodeSangerGencodeGeneClasses_tmp set class = 
'Undefined'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_pseudogene' where level = 1 and transcriptType = 'IG_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_pseudogene' where level = 1 and transcriptType = 'pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_pseudogene' where level = 1 and transcriptType = 'transcribed_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_processed_pseudogene' where level = 1 and transcriptType = 'processed_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_processed_pseudogene' where level = 1 and transcriptType = 'transcribed_processed_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_unprocessed_pseudogene' where level = 1 and transcriptType = 'unitary_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Validated_unprocessed_pseudogene' where level = 1 and transcriptType = 'unprocessed_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_coding' where level = 2 and transcriptType = 'IG_gene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_coding' where level = 2 and transcriptType = 'protein_coding'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_non_coding' where level = 2 and transcriptType = 'ambiguous_orf'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_non_coding' where level = 2 and transcriptType = 'antisense'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_non_coding' where level = 2 and transcriptType = 'non_coding'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_non_coding' where level = 2 and transcriptType = 'retained_intron'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_non_coding' where level = 2 and transcriptType = 'processed_transcript'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 
'Havana_nonsense' where level = 2 and transcriptType = 'nonsense_mediated_decay'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_pseudogene' where level = 2 and transcriptType = 'IG_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_pseudogene' where level = 2 and transcriptType = 'pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_processed_pseudogene' where level = 2 and transcriptType = 'processed_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_processed_pseudogene' where level = 2 and transcriptType = 'transcribed_processed_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_processed_pseudogene' where level = 2 and transcriptType = 'transcribed_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_unprocessed_pseudogene' where level = 2 and transcriptType = 'polymorphic_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_unprocessed_pseudogene' where level = 2 and transcriptType = 'transcribed_unprocessed_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_unprocessed_pseudogene' where level = 2 and transcriptType = 'unitary_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_unprocessed_pseudogene' where level = 2 and transcriptType = 'unprocessed_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_TEC' where level = 2 and transcriptType = 'artifact'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Havana_TEC' where level = 2 and transcriptType = 'TEC'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_coding' where level = 3 and transcriptType = 'C_segment'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_coding' where level = 3 and transcriptType = 'J_segment'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_coding' where level = 3 and transcriptType = 
'protein_coding'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_coding' where level = 3 and transcriptType = 'V_segment'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'miRNA'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'miRNA_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'misc_RNA'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'misc_RNA_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'Mt_tRNA_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'rRNA'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'rRNA_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'scRNA'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'scRNA_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'snoRNA'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'snoRNA_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'snRNA'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'snRNA_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_RNA' where level = 3 and transcriptType = 'tRNA_pseudogene'; update wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_pseudogene' where level = 3 and transcriptType = 'pseudogene'; update 
wgEncodeSangerGencodeGeneClasses_tmp set class = 'Ensembl_pseudogene' where level = 3 and transcriptType = 'retrotransposed'; select count(*) from wgEncodeSangerGencodeGeneClasses_tmp where class = 'Undefined'; select count(*) from wgEncodeSangerGencodeGeneClasses_tmp where class = 'Havana_processed_pseudogene'; | 1295 | select count(*) from wgEncodeSangerGencodeGeneClasses_tmp where level = 2 and transcriptType = 'processed_pseudogene'; | 1238 | select count(*) from wgEncodeSangerGencodeGeneClasses_tmp where level = 2 and transcriptType = 'transcribed_processed_pseudogene'; | 7 | select count(*) from wgEncodeSangerGencodeGeneClasses_tmp where level = 2 and transcriptType = 'transcribed_pseudogene'; | 50 | select distinct class,level,transcriptType from wgEncodeSangerGencodeGeneClasses_tmp order by class,level,transcriptType; +----------------------------------+-------+------------------------------------+ | class | level | transcriptType | +----------------------------------+-------+------------------------------------+ | Validated_processed_pseudogene | 1 | processed_pseudogene | | Validated_processed_pseudogene | 1 | transcribed_processed_pseudogene | | Validated_unprocessed_pseudogene | 1 | unitary_pseudogene | | Validated_unprocessed_pseudogene | 1 | unprocessed_pseudogene | | Validated_pseudogene | 1 | IG_pseudogene | | Validated_pseudogene | 1 | pseudogene | | Validated_pseudogene | 1 | transcribed_pseudogene | | Havana_coding | 2 | IG_gene | | Havana_coding | 2 | protein_coding | | Havana_nonsense | 2 | nonsense_mediated_decay | | Havana_non_coding | 2 | ambiguous_orf | | Havana_non_coding | 2 | antisense | | Havana_non_coding | 2 | non_coding | | Havana_non_coding | 2 | processed_transcript | | Havana_non_coding | 2 | retained_intron | | Havana_processed_pseudogene | 2 | processed_pseudogene | | Havana_processed_pseudogene | 2 | transcribed_processed_pseudogene | | Havana_processed_pseudogene | 2 | transcribed_pseudogene | | Havana_unprocessed_pseudogene | 
2 | polymorphic_pseudogene | | Havana_unprocessed_pseudogene | 2 | transcribed_unprocessed_pseudogene | | Havana_unprocessed_pseudogene | 2 | unitary_pseudogene | | Havana_unprocessed_pseudogene | 2 | unprocessed_pseudogene | | Havana_pseudogene | 2 | IG_pseudogene | | Havana_pseudogene | 2 | pseudogene | | Havana_TEC | 2 | artifact | | Havana_TEC | 2 | TEC | | Ensembl_coding | 3 | C_segment | | Ensembl_coding | 3 | J_segment | | Ensembl_coding | 3 | protein_coding | | Ensembl_coding | 3 | V_segment | | Ensembl_RNA | 3 | miRNA | | Ensembl_RNA | 3 | miRNA_pseudogene | | Ensembl_RNA | 3 | misc_RNA | | Ensembl_RNA | 3 | misc_RNA_pseudogene | | Ensembl_RNA | 3 | Mt_tRNA_pseudogene | | Ensembl_RNA | 3 | rRNA | | Ensembl_RNA | 3 | rRNA_pseudogene | | Ensembl_RNA | 3 | scRNA | | Ensembl_RNA | 3 | scRNA_pseudogene | | Ensembl_RNA | 3 | snoRNA | | Ensembl_RNA | 3 | snoRNA_pseudogene | | Ensembl_RNA | 3 | snRNA | | Ensembl_RNA | 3 | snRNA_pseudogene | | Ensembl_RNA | 3 | tRNA_pseudogene | | Ensembl_pseudogene | 3 | pseudogene | | Ensembl_pseudogene | 3 | retrotransposed | +----------------------------------+-------+------------------------------------+ 46 rows in set (0.11 sec) rename table wgEncodeSangerGencodeGeneClasses_tmp to wgEncodeSangerGencodeClasses; alter table wgEncodeSangerGencodeClasses add column name varchar(255); update wgEncodeSangerGencodeClasses set name = transcriptId; alter table wgEncodeSangerGencodeClasses drop primary key; alter table wgEncodeSangerGencodeClasses add primary key(name); rename table wgEncodeSangerGencodeClasses to wgEncodeGencodeClasses; GENCODE Round 2 =============== A) Split gtf into header, lvls 1,2,3 head -5 gencode_data.rel2.gtf > gencode.rel2.header.gtf grep "level 1" gencode_data.rel2.gtf > gencode.rel2.lvl1.gtf grep "level 2" gencode_data.rel2.gtf > gencode.rel2.lvl2.gtf grep "level 3" gencode_data.rel2.gtf > gencode.rel2.lvl3.gtf wc -l gencode_data.rel2.gtf wc -l gencode.rel2.header.gtf wc -l gencode.rel2.lvl1.gtf wc -l 
gencode.rel2.lvl2.gtf wc -l gencode.rel2.lvl3.gtf 1238932 gencode_data.rel2.gtf 5 gencode.rel2.header.gtf 12617 gencode.rel2.lvl1.gtf 954754 gencode.rel2.lvl2.gtf 248892 gencode.rel2.lvl3.gtf 5 + 12617 + 954754 + 248892 = 1216268 - 1238932 = -22664 Missing! grep -v "##" gencode_data.rel2.gtf | grep -v "level" > gencode.rel2.missing.gtf wc -l gencode.rel2.missing.gtf 22664 gencode.rel2.missing.gtf head gencode.rel2.missing.gtf chr1 HAVANA polyA_signal 131340 131345 . - . . chr1 HAVANA pseudo_polyA 218144 218149 . - . . grep -v "HAVANA" gencode.rel2.missing.gtf | wc -l 0 grep -v "olyA" gencode.rel2.missing.gtf | wc -l 0 mv gencode.rel2.missing.gtf gencode.rel2.HavanaPolyA.gtf cp gencode.rel2.header.gtf gencode.rel2.manual.gtf cat gencode.rel2.lvl1.gtf >> gencode.rel2.manual.gtf cat gencode.rel2.lvl2.gtf >> gencode.rel2.manual.gtf cat gencode.rel2.HavanaPolyA.gtf >> gencode.rel2.manual.gtf cp gencode.rel2.header.gtf gencode.rel2.auto.gtf cat gencode.rel2.lvl1.gtf >> gencode.rel2.auto.gtf cp gencode.rel2.header.gtf gencode.rel2.manual_noPolyA.gtf cat gencode.rel2.lvl1.gtf >> gencode.rel2.manual_noPolyA.gtf cat gencode.rel2.lvl2.gtf >> gencode.rel2.manual_noPolyA.gtf B) Doctor missing transcript_ids C) format uniq classes sed "s/\"//g" gencode.rel2.lvl3.gtf | sed "s/;//g" | awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t3\n",$10,$12,$22,$24,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t3\n", $10,$12,$22,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") printf 
"%s\t%s\t\t\t%s\t%s\t%s\t%s\t3\n", $10,$12,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "gene_type" && $13 == "gene_status") printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t3\n", $10,$10,$12,$12,$14,$14; else printf "EXTERMINATE\t%s\n",$0}' > encode.gencode.classes.lvl3.tab wc -l gencode.rel2.lvl3.gtf wc -l encode.gencode.classes.lvl3.tab grep "EXTERMINATE" encode.gencode.classes.lvl3.tab | wc -l 248892 gencode.rel2.lvl3.gtf 248892 encode.gencode.classes.lvl3.tab 0 uniq encode.gencode.classes.lvl3.tab > encode.gencode.classes.u3.tab wc -l encode.gencode.classes.u3.tab 39695 encode.gencode.classes.u3.tab sed "s/\"//g" gencode.rel2.lvl1.gtf | sed "s/;//g" | awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t1\n",$10,$12,$22,$24,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t1\n", $10,$12,$22,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t1\n", $10,$12,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "gene_type" && $13 == "gene_status") printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t1\n", $10,$10,$12,$12,$14,$14; else printf "EXTERMINATE\t%s\n",$0}' > encode.gencode.classes.lvl1.tab wc -l gencode.rel2.lvl1.gtf wc -l encode.gencode.classes.lvl1.tab grep "EXTERMINATE" encode.gencode.classes.lvl1.tab | wc -l 12617 gencode.rel2.lvl1.gtf 12617 encode.gencode.classes.lvl1.tab 0 uniq encode.gencode.classes.lvl1.tab > encode.gencode.classes.u1.tab wc -l encode.gencode.classes.u1.tab 7775 encode.gencode.classes.u1.tab sed "s/\"//g" gencode.rel2.lvl2.gtf | sed 
"s/;//g" | awk '{if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name" && $23 == "transcript_name") printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t2\n",$10,$12,$22,$24,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status" && $21 == "gene_name") printf "%s\t%s\t%s\t\t%s\t%s\t%s\t%s\t2\n", $10,$12,$22,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "transcript_id" && $13 == "transcript_type" && $15 == "transcript_status" && $17 == "gene_type" && $19 == "gene_status") printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t2\n", $10,$12,$18,$14,$20,$16; else if($9 == "gene_id" && $11 == "gene_type" && $13 == "gene_status") printf "%s\t%s\t\t\t%s\t%s\t%s\t%s\t2\n", $10,$10,$12,$12,$14,$14; else printf "EXTERMINATE\t%s\n",$0}' > encode.gencode.classes.lvl2.tab wc -l gencode.rel2.lvl2.gtf wc -l encode.gencode.classes.lvl2.tab grep "EXTERMINATE" encode.gencode.classes.lvl2.tab | wc -l 954754 gencode.rel2.lvl2.gtf 954754 encode.gencode.classes.lvl2.tab 0 uniq encode.gencode.classes.lvl2.tab > encode.gencode.classes.u2.tab wc -l encode.gencode.classes.u2.tab 100681 encode.gencode.classes.u2.tab D) tar rm gencode_rel2.tgz tar -cpzf gencode_rel2.tgz * E) Submit F) load uniq.tabs into hg18 temporarily. 
echo "CREATE TABLE wgEncodeSangerGencodeGeneClasses_tmp ( geneId varchar(255) not null, transcriptId varchar(255) not null, geneName varchar(255) not null, transcriptName varchar(255) not null, geneType varchar(255) not null, transcriptType varchar(255) not null, geneStatus varchar(255) not null, transcriptStatus varchar(255) not null, level integer, class enum('Undefined','Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene','Havana_coding','Havana_nonsense','Havana_non_coding','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC','Havana_polyA','Ensembl_coding','Ensembl_RNA','Ensembl_pseudogene') not null default 'Undefined', INDEX(transcriptName), INDEX(class,level,transcriptType), PRIMARY KEY(transcriptId));" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'encode.gencode.classes.u1.tab' into table wgEncodeSangerGencodeGeneClasses20090101" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'encode.gencode.classes.u2.tab' into table wgEncodeSangerGencodeGeneClasses20090101" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'encode.gencode.classes.u3.tab' into table wgEncodeSangerGencodeGeneClasses20090101" | hgsql hg18 select count(*) from wgEncodeSangerGencodeGencodeAuto20090101; | 16293 | select count(*) from wgEncodeSangerGencodeGencodeManual20090101; | 67432 | select count(*) from wgEncodeSangerGencodeGeneClasses_tmp; | 83725 | GENCODE tRNAs and polyAs 2009-04-07 =================================== /cse/staff/tdreszer/docs/ENCODE/gencode/gencode_polyAs.rel2.gtf /cse/staff/tdreszer/docs/ENCODE/gencode/gencode_tRNAscans.rel2.gtf pip cd 265 cp /cse/staff/tdreszer/docs/ENCODE/gencode/gencode_polyAs.rel2.gtf . cp /cse/staff/tdreszer/docs/ENCODE/gencode/gencode_tRNAscans.rel2.gtf . 
create table wgEncodeTmpGencodeAutoRel2 select * from wgEncodeGencodeAutoRel2; /cluster/bin/x86_64/ldHgGene -gtf -genePredExt -noncoding -oldTable hg18 wgEncodeTmpGencodeAutoRel2 gencode_tRNAscans.rel2.gtf > load_tRnas.out 2>&1 & [hgwdev:tdreszer 265> Reading gencode_tRNAscans.rel2.gtf Read 623 transcripts in 623 lines in 1 files 623 groups 25 seqs 1 sources 1 feature types 0 gene predictions /cluster/bin/x86_64/ldHgGene -gtf -genePredExt -noncoding hg18 wgEncodeGencodePolyaRel2 gencode_polyAs.rel2.gtf > load_polyAs.out 2>&1 & /cluster/bin/x86_64/ldHgGene -genePredExt -noncoding hg18 wgEncodeGencodePolyaRel2 gencode_polyAs.rel2.gtf > load_polyAs.out 2>&1 & /cluster/bin/x86_64/ldHgGene ldHgGene - load database with gene predictions from a gff file. usage: ldHgGene database table file(s).gff options: -bin Add bin column (now the default) -nobin don't add binning (you probably don't want this) -exon=type Sets type field for exons to specific value -oldTable Don't overwrite what's already in table -noncoding Forces whole prediction to be UTR -gtf input is GTF, stop codon is not in CDS -predTab input is already in genePredTab format -requireCDS discard genes that don't have CDS annotation -out=gpfile write output, in genePred format, instead of loading table. Database is ignored. -genePredExt create a extended genePred, including frame information and gene name -impliedStopAfterCds - implied stop codon in GFF/GTF after CDS head gencode_tRNAscans.rel2.gtf chr1 HAVANA tRNAscan 7912926 7912995 . - . gene_id 199079; transcript_id 199079; genename "Pseudo"; transcriptname "Pseudo";; transcript_type "tRNAscan"; transcript_status "NOVEL"; gene_type "tRNAscan"; gene_status "NOVEL"; level 3; chr1 HAVANA tRNAscan 16719667 16719740 . - . gene_id 199126; transcript_id 199126; genename "Asn"; transcriptname "Asn";; transcript_type "tRNAscan"; transcript_status "NOVEL"; gene_type "tRNAscan"; gene_status "NOVEL"; level 3; chr1 HAVANA tRNAscan 16731480 16731553 . - . 
gene_id 199125; transcript_id 199125; genename "Asn"; transcriptname "Asn";; transcript_type "tRNAscan"; transcript_status "NOVEL"; gene_type "tRNAscan"; gene_status "NOVEL"; level 3; chr1 HAVANA tRNAscan 16734361 16734432 . - . gene_id 199124; transcript_id 199124; genename "Glu"; transcriptname "Glu";; transcript_type "tRNAscan"; transcript_status "NOVEL"; gene_type "tRNAscan"; gene_status "NOVEL"; level 3; chr1 HAVANA tRNAscan 16745021 16745091 . - . gene_id 199123; transcript_id 199123; genename "Gly"; transcriptname "Gly";; transcript_type "tRNAscan"; transcript_status "NOVEL"; gene_type "tRNAscan"; gene_status "NOVEL"; level 3; head gencode_polyAs.rel2.gtf chr1 HAVANA polyA_site 131340 131345 . - . gene_id 418719; transcript_id 418719; chr1 HAVANA pseudo_polyA 218144 218149 . - . gene_id 418720; transcript_id 418720; chr1 HAVANA polyA_signal 443706 443711 . - . gene_id 418722; transcript_id 418722; chr1 HAVANA polyA_signal 519796 519801 . + . gene_id 418723; transcript_id 418723; chr1 HAVANA polyA_signal 552634 552639 . - . 
gene_id 418726; transcript_id 418726; /cluster/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt -noncoding -oldTable hg18 wgEncodeTmpGencodeAutoRel2 gencode_tRNAscans.rel2.gtf > load_tRnas.out 2>&1 & Now the tRNAs are loaded but there are no class records to cover them grep HAVANA gencode_polyAs.rel2.gtf | wc -l 23036 grep polyA_signal gencode_polyAs.rel2.gtf | wc -l 21636 grep pseudo_polyA gencode_polyAs.rel2.gtf | wc -l 1277 grep polyA_site gencode_polyAs.rel2.gtf | wc -l 42 grep -v polyA_signal gencode_polyAs.rel2.gtf | grep -v pseudo_polyA | grep -v polyA_site grep PolyA_signal gencode_polyAs.rel2.gtf | wc -l 79 21636 + 1277 + 42 + 79 = 23034 grep -v polyA_signal gencode_polyAs.rel2.gtf | grep -v pseudo_polyA | grep -v PolyA_signal | grep -v polyA_site "PolyA signal" grep PolyA gencode_polyAs.rel2.gtf | wc -l 81 grep -v PolyA gencode_polyAs.rel2.gtf > gencode.rel2.polyAs.gtf grep PolyA gencode_polyAs.rel2.gtf | sed "s/PolyA/polyA/" | sed "s/polyA signal/polyA_signal/" >> gencode.rel2.polyAs.gtf wc -l gencode.rel2.polyAs.gtf 23041 gencode.rel2.polyAs.gtf wc -l gencode_polyAs.rel2.gtf 23041 gencode_polyAs.rel2.gtf grep -v polyA_signal gencode.rel2.polyAs.gtf | grep -v pseudo_polyA | grep -v polyA_site | grep -v "##" | wc -l 0 So now the possible exon types are: polyA_signal,pseudo_polyA,polyA_site head -5 gencode.rel2.polyAs.gtf > gencode.rel2.polyA_signal.gtf grep polyA_signal gencode.rel2.polyAs.gtf >> gencode.rel2.polyA_signal.gtf head -5 gencode.rel2.polyAs.gtf > gencode.rel2.pseudo_polyA.gtf grep pseudo_polyA gencode.rel2.polyAs.gtf >> gencode.rel2.pseudo_polyA.gtf head -5 gencode.rel2.polyAs.gtf > gencode.rel2.polyA_site.gtf grep polyA_site gencode.rel2.polyAs.gtf >> gencode.rel2.polyA_site.gtf /cluster/bin/x86_64/ldHgGene -exon=polyA_signal -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaRel2 gencode.rel2.polyA_signal.gtf > load_polyA.out 2>&1 & /cluster/bin/x86_64/ldHgGene -exon=pseudo_polyA -genePredExt -noncoding -oldTable hg18 
wgEncodeGencodePolyaRel2 gencode.rel2.pseudo_polyA.gtf >> load_polyA.out 2>&1 & /cluster/bin/x86_64/ldHgGene -exon=polyA_site -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaRel2 gencode.rel2.polyA_site.gtf >> load_polyA.out 2>&1 & okay, it is now all about classes! Consistent naming: rename table wgEncodeTmpGencodeAutoRel2 to wgEncodeGencodeAutoRel2Tmp cp gencode_tRNAscans.rel2.gtf gencode.rel2.tRNAs.gtf Work on tmp table: create table wgEncodeGencodeClassesRel2Tmp select * from wgEncodeGencodeClassesRel2; make tRNA only table to simplify /cluster/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt -noncoding hg18 wgEncodeGencodeAutoRel2TrnasOnly gencode_tRNAscans.rel2.gtf > load_tRnas2.out 2>&1 & No need drop table wgEncodeGencodeAutoRel2TrnasOnly; desc wgEncodeGencodeClassesRel2Tmp; +------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----+-----------+-------+ | Field | Type | Null | Key | Default | Extra | +------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----+-----------+-------+ | geneId | varchar(255) | | | | | | transcriptId | varchar(255) | | | | | | geneName | varchar(255) | | | | | | transcriptName | varchar(255) | | | | | | geneType | varchar(255) | | | | | | transcriptType | varchar(255) | | | | | | geneStatus | varchar(255) | | | | | | transcriptStatus | varchar(255) | 
| | | | | level | int(11) | YES | | NULL | | | class | enum('Undefined','Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene','Havana_coding','Havana_nonsense','Havana_non_coding','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC','Havana_polyA','Ensembl_coding','Ensembl_RNA','Ensembl_pseudogene') | | | Undefined | | | name | varchar(255) | | | | | +------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----+-----------+-------+ tail -23036 gencode.rel2.polyAs.gtf | sed "s/\"//g" | sed "s/;//g" | awk '{printf "%s\t%s\t\t\t%s\t%s\tUNKNOWN\tUNKNOWN\t2\tHavana_polyA\t%s\n",$10,$12,$3,$3,$10;}' > encode.gencode.classes.polyA.tab head encode.gencode.classes.polyA.tab tail -623 gencode.rel2.tRNAs.gtf | sed "s/\"//g" | sed "s/;//g" | awk '{printf "%s\t%s\t\t\t%s\t%s\tUNKNOWN\tUNKNOWN\t3\tEnsembl_RNA\t%s\n",$10,$12,$3,$3,$10;}' > encode.gencode.classes.tRNAs.tab head encode.gencode.classes.tRNAs.tab echo "LOAD DATA LOCAL INFILE 'encode.gencode.classes.polyA.tab' into table wgEncodeGencodeClassesRel2Tmp" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'encode.gencode.classes.tRNAs.tab' into table wgEncodeGencodeClassesRel2Tmp" | hgsql hg18 rename table wgEncodeGencodeClassesRel2 to wgEncodeGencodeClassesRel2Old; rename table wgEncodeGencodeClassesRel2Tmp to wgEncodeGencodeClassesRel2; rename table wgEncodeGencodeAutoRel2 to wgEncodeGencodeAutoRel2Old; rename table wgEncodeGencodeAutoRel2Tmp to wgEncodeGencodeAutoRel2; select count(*) from wgEncodeGencodeAutoRel2 where chrom NOT in ( 
"chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10", "chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20", "chr21","chr22","chrX","chrY","chrM"); 42 select * from wgEncodeGencodeAutoRel2 where chrom NOT in ( "chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10", "chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20", "chr21","chr22","chrX","chrY","chrM"); chrMT select count(*) from wgEncodeGencodeAutoRel2 where chrom = "chrM"; 0 update wgEncodeGencodeAutoRel2 set chrom = "chrM" where chrom = "chrMT"; hgsql -N -B -e "select * from wgEncodeGencodeClassesRel2" hg18 > wgEncodeGencodeClassesRel2.tab TOBEDONE!!!!: 1) Should genePred tables have geneName in name2? Currently they have geneId 2) Classes table is missing geneName and transcriptName!!! Must rebuild 3) Once classes is rebuilt, either: a) Fix ldHgGene to use geneName, not geneId b) Manually update genePred tables with name2 = classes.geneName c) Code browser to look up geneName whenever name2 is used! 
Changed local ldHgGene ~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeManualNew gencode.rel2.manual_noPolyA.gtf > reload_manual.out 2>&1 & rename table wgEncodeGencodeManualRel2 to wgEncodeGencodeManualRel2Old; rename table wgEncodeGencodeManualNew to wgEncodeGencodeManualRel2; ~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeAutoNew gencode.rel2.auto_with_tRNAs.gtf > reload_auto.out 2>&1 & rename table wgEncodeGencodeAutoRel2 to wgEncodeGencodeAutoRel2Old; rename table wgEncodeGencodeAutoNew to wgEncodeGencodeAutoRel2; update wgEncodeGencodeAutoRel2 set chrom = "chrM" where chrom = "chrMT"; # classes table doesn't have the right key structure (but Rel1 does) so: alter table wgEncodeGencodeClassesRel2 add primary key name; ### Doesn't work create table wgEncodeGencodeClassesNew like wgEncodeGencodeClassesRel1; ### Doesn't work CREATE TABLE wgEncodeGencodeClassesNew ( geneId varchar(255) not null, transcriptId varchar(255) not null, geneName varchar(255) not null, transcriptName varchar(255) not null, geneType varchar(255) not null, transcriptType varchar(255) not null, geneStatus varchar(255) not null, transcriptStatus varchar(255) not null, level integer, class enum('Undefined','Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene','Havana_coding','Havana_nonsense','Havana_non_coding','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC','Havana_polyA','Ensembl_coding','Ensembl_RNA','Ensembl_pseudogene') not null default 'Undefined', name varchar(255) not null, # indexes INDEX(transcriptType), INDEX(level,class), PRIMARY KEY(name) ); insert into wgEncodeGencodeClassesNew select * from wgEncodeGencodeClassesRel2; rename table wgEncodeGencodeClassesRel2 to wgEncodeGencodeClassesRel2Old; rename table wgEncodeGencodeClassesNew to wgEncodeGencodeClassesRel2; tRNAs appear to be missing!!!
2009-05-05 ======================================== ~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeTrnaNew gencode.rel2.tRNAs.gtf > reload_tRNAs.out 2>&1 & select count(*) from wgEncodeGencodeTrnaNew;| 0 | ~/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt hg18 wgEncodeGencodeTrnaNew gencode.rel2.tRNAs.gtf > reload_tRNAs.out 2>&1 & select count(*) from wgEncodeGencodeTrnaNew;| 623 | select count(*) from wgEncodeGencodeTrnaNew t1, wgEncodeGencodeClassesRel2 t2 where t1.name = t2.name;| 623 | # So these transcripts are already in wgEncodeGencodeClassesRel2 insert into wgEncodeGencodeAutoRel2 select * from wgEncodeGencodeTrnaNew; update wgEncodeGencodeAutoRel2 set chrom = "chrM" where chrom = "chrMT"; drop table wgEncodeGencodeTrnaNew; /cluster/data/encode/pipeline/bin/encodeStatus.pl 21 approved /cluster/data/encode/pipeline/bin/encodeStatus.pl 265 approved /cluster/data/encode/pipeline/bin/encodeStatus.pl 21 reviewing /cluster/data/encode/pipeline/bin/encodeStatus.pl 265 reviewing /cluster/data/encode/pipeline/bin/encodeStatus.pl 21 released /cluster/data/encode/pipeline/bin/encodeStatus.pl 265 released Release 3 2009-08-26 ==================== start with cd {pip}/265 mkdir rel3; cd rel3 ftp release_3_DCC.tgz into . 
tar -tzf release_3_DCC.tgz tar -xzf release_3_DCC.tgz cd to_release wc -l classes.def 68 uniq -u < classes.def | wc -l 68 sort -k 2,3 classes.def | uniq -f 1 | wc -l 19 sort -k 2,3 classes.def > classes_sorted.def sort -k 2,3 classes.def | uniq -f 1 > classes_uniq.def sort -f classes.def > classes_types.def # Use these files to create unique lists of classes, types (and relation between the two) # Figure out what the differences between Rel2 and Rel3 are for classes and types # Classes: Validated_coding: protein_coding Validated_processed Validated_processed_pseudogene: processed_pseudogene,processed_transcript,transcribed_processed_pseudogene Validated_unprocessed_pseudogene: transcribed_unprocessed_pseudogene,unprocessed_pseudogene Validated_pseudogene: IG_pseudogene,polymorphic_pseudogene,pseudogene,retrotransposed,unitary_pseudogene Havana_coding: IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,protein_coding Havana_non_coding: ambiguous_orf,antisense,non_coding,processed_transcript,retained_intron Havana_nonsense: nonsense_mediated_decay Havana_polyA: polyA_signal,polyA_site,pseudo_polyA Havana_processed_pseudogene: processed_pseudogene,transcribed_processed_pseudogene Havana_unprocessed_pseudogene: transcribed_unprocessed_pseudogene,unprocessed_pseudogene Havana_pseudogene: IG_pseudogene,TR_pseudogene,polymorphic_pseudogene,pseudogene,retrotransposed,unitary_pseudogene Havana_TEC: TEC,artifact Ensembl_coding: IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,protein_coding Ensembl_non_coding: antisense,non_coding,processed_transcript,retained_intron Ensembl_processed_pseudogene: processed_pseudogene Ensembl_unprocessed_pseudogene: unprocessed_pseudogene Ensembl_pseudogene: IG_pseudogene,miRNA_pseudogene,misc_RNA_pseudogene,pseudogene,retrotransposed,unitary_pseudogene Ensembl_RNA: Mt_rRNA,Mt_tRNA,Mt_tRNA_pseudogene,miRNA,misc_RNA,rRNA,rRNA_pseudogene,scRNA_pseudogene,snRNA,snRNA_pseudogene,snoRNA,snoRNA_pseudogene,tRNA_pseudogene,tRNAscan # Class List (superset of Rel2): 
Validated_coding,Validated_processed,Validated_processed_pseudogene,Validated_unprocessed_pseudogene,Validated_pseudogene,Havana_coding,Havana_non_coding,Havana_nonsense,Havana_polyA,Havana_processed_pseudogene,Havana_unprocessed_pseudogene,Havana_pseudogene,Havana_TEC,Ensembl_coding,Ensembl_non_coding,Ensembl_processed_pseudogene,Ensembl_unprocessed_pseudogene,Ensembl_pseudogene,Ensembl_RNA # Types: ambiguous_orf: Havana_non_coding antisense: Havana_non_coding,Ensembl_non_coding artifact: Havana_TEC IG_C_gene: Havana_coding,Ensembl_coding IG_D_gene: Havana_coding,Ensembl_coding IG_J_gene: Havana_coding,Ensembl_coding IG_pseudogene: Validated_pseudogene,Havana_pseudogene,Ensembl_pseudogene IG_V_gene: Havana_coding,Ensembl_coding miRNA: Ensembl_RNA miRNA_pseudogene: Ensembl_pseudogene misc_RNA: Ensembl_RNA misc_RNA_pseudogene: Ensembl_pseudogene Mt_rRNA: Ensembl_RNA Mt_tRNA: Ensembl_RNA Mt_tRNA_pseudogene: Ensembl_RNA nonsense_mediated_decay: Havana_nonsense non_coding: Havana_non_coding,Ensembl_non_coding polyA_signal: Havana_polyA polyA_site: Havana_polyA polymorphic_pseudogene: Validated_pseudogene,Havana_pseudogene processed_pseudogene: Validated_processed_pseudogene,Havana_processed_pseudogene,Ensembl_processed_pseudogene processed_transcript: Validated_processed_pseudogene,Havana_non_coding,Ensembl_non_coding protein_coding: Validated_coding,Havana_coding,Ensembl_coding pseudogene: Validated_pseudogene,Havana_pseudogene,Ensembl_pseudogene pseudo_polyA: Havana_polyA retained_intron: Havana_non_coding,Ensembl_non_coding retrotransposed: Validated_pseudogene,Havana_pseudogene,Ensembl_pseudogene rRNA: Ensembl_RNA rRNA_pseudogene: Ensembl_RNA scRNA_pseudogene: Ensembl_RNA snoRNA: Ensembl_RNA snoRNA_pseudogene: Ensembl_RNA snRNA: Ensembl_RNA snRNA_pseudogene: Ensembl_RNA TEC: Havana_TEC transcribed_processed_pseudogene: Validated_processed_pseudogene,Havana_processed_pseudogene transcribed_unprocessed_pseudogene:
Validated_unprocessed_pseudogene,Havana_unprocessed_pseudogene tRNAscan: Ensembl_RNA tRNA_pseudogene: Ensembl_RNA TR_pseudogene: Havana_pseudogene unitary_pseudogene: Validated_pseudogene,Havana_pseudogene,Ensembl_pseudogene unprocessed_pseudogene: Validated_unprocessed_pseudogene,Havana_unprocessed_pseudogene,Ensembl_unprocessed_pseudogene # Type List: ambiguous_orf,antisense,artifact,IG_C_gene,IG_D_gene,IG_J_gene,IG_pseudogene,IG_V_gene,miRNA,miRNA_pseudogene,misc_RNA,misc_RNA_pseudogene,Mt_rRNA,Mt_tRNA,Mt_tRNA_pseudogene,nonsense_mediated_decay,non_coding,polyA_signal,polyA_site,polymorphic_pseudogene,processed_pseudogene,processed_transcript,protein_coding,pseudogene,pseudo_polyA,retained_intron,retrotransposed,rRNA,rRNA_pseudogene,scRNA_pseudogene,snoRNA,snoRNA_pseudogene,snRNA,snRNA_pseudogene,TEC,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,tRNAscan,tRNA_pseudogene,TR_pseudogene,unitary_pseudogene,unprocessed_pseudogene Rel3: ambiguous_orf,antisense,artifact, IG_C_gene,IG_D_gene,IG_J_gene,IG_pseudogene,IG_V_gene, miRNA,miRNA_pseudogene,misc_RNA,misc_RNA_pseudogene,Mt_rRNA,Mt_tRNA,Mt_tRNA_pseudogene,nonsense_mediated_decay,non_coding,polyA_signal,polyA_site,polymorphic_pseudogene,processed_pseudogene,processed_transcript,protein_coding,pseudogene,pseudo_polyA,retained_intron,retrotransposed,rRNA,rRNA_pseudogene, scRNA_pseudogene,snoRNA,snoRNA_pseudogene,snRNA,snRNA_pseudogene,TEC,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene, tRNAscan,tRNA_pseudogene,TR_pseudogene, unitary_pseudogene,unprocessed_pseudogene Rel2: ambiguous_orf,antisense,artifact,C_segment,IG_gene, IG_pseudogene, J_segment,miRNA,miRNA_pseudogene,misc_RNA,misc_RNA_pseudogene, 
Mt_tRNA_pseudogene,nonsense_mediated_decay,non_coding,polyA_signal,polyA_site,polymorphic_pseudogene,processed_pseudogene,processed_transcript,protein_coding,pseudogene,pseudo_polyA,retained_intron,retrotransposed,rRNA,rRNA_pseudogene,scRNA,scRNA_pseudogene,snoRNA,snoRNA_pseudogene,snRNA,snRNA_pseudogene,TEC,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,transcribed_pseudogene,tRNAscan,tRNA_pseudogene, V_segment,unitary_pseudogene,unprocessed_pseudogene # New in Rel3: IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,Mt_rRNA,Mt_tRNA,TR_pseudogene, # Missing in Rel3: C_segment,IG_gene,J_segment,scRNA,transcribed_pseudogene,V_segment, # Make new trackDb setting based upon Rel3: filterBy level:Level=+Validated,Manual_annotation,Automatic_annotation class:Class=Validated_coding,Validated_processed,Validated_processed_pseudogene,Validated_unprocessed_pseudogene,Validated_pseudogene,Havana_coding,Havana_non_coding,Havana_nonsense,Havana_polyA,Havana_processed_pseudogene,Havana_unprocessed_pseudogene,Havana_pseudogene,Havana_TEC,Ensembl_coding,Ensembl_non_coding,Ensembl_processed_pseudogene,Ensembl_unprocessed_pseudogene,Ensembl_pseudogene,Ensembl_RNA transcriptType:Transcript_Type=ambiguous_orf,antisense,artifact,IG_C_gene,IG_D_gene,IG_J_gene,IG_pseudogene,IG_V_gene,miRNA,miRNA_pseudogene,misc_RNA,misc_RNA_pseudogene,Mt_rRNA,Mt_tRNA,Mt_tRNA_pseudogene,nonsense_mediated_decay,non_coding,polyA_signal,polyA_site,polymorphic_pseudogene,processed_pseudogene,processed_transcript,protein_coding,pseudogene,pseudo_polyA,retained_intron,retrotransposed,rRNA,rRNA_pseudogene,scRNA_pseudogene,snoRNA,snoRNA_pseudogene,snRNA,snRNA_pseudogene,TEC,transcribed_processed_pseudogene,transcribed_unprocessed_pseudogene,tRNAscan,tRNA_pseudogene,TR_pseudogene,unitary_pseudogene,unprocessed_pseudogene wc -l: 84824 gencode.v3.level_1_2_annotation.NCBI36.classes 42472 gencode.v3.level_3_annotation.NCBI36.classes 127296 gencode.v3.annotation.NCBI36.classes 23361
gencode.v3.polyAs.NCBI36.classes 621 gencode.v3.tRNAs.NCBI36.classes Make special DAF/DDF in to_release, then tar all tar -cpzf gencode_rel3.tgz * Submit cd ../../../453 m validate_error sh: line 1: 1118 Segmentation fault gtfToGenePred /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/gencode.v3.level_1_2_annotation.NCBI36.gtf /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/doEncodeValidate.gtf.bed > /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/doEncodeValidate.gtf.err 2>&1 File 'gencode.v3.level_1_2_annotation.NCBI36.gtf' failed GTF validation ~/bin/x86_64/gtfToGenePred -allErrors gencode.v3.level_1_2_annotation.NCBI36.gtf doEncodeValidate.gtf.bed > doEncodeValidate.gtf.err 2>&1 gtfToGenePred gencode.v3.level_3_annotation.NCBI36.gtf doEncodeValidate.gtf.bed > doEncodeValidate.gtf.err 2>&1 gtfToGenePred gencode.v3.polyAs.NCBI36.gtf doEncodeValidate.gtf.bed > doEncodeValidate.gtf.err 2>&1 ~/bin/x86_64/gtfToGenePred gencode.v3.tRNAs.NCBI36.gtf doEncodeValidate.gtf.bed > doEncodeValidate.gtf.err 2>&1 # Turns out Mark has been making changes to gtfToGenePred but when I compiled the latest, running it by hand did not segFault resubmit m validate_error File 'gencode.v3.level_1_2_annotation.NCBI36.gtf' failed GTF validation no exons defined for ENSG00000223972 ~/bin/x86_64/gtfToGenePred -allErrors gencode.v3.level_1_2_annotation.NCBI36.gtf v3Manual.gtf.bed > v3Manual.gtf.err 2>&1 ~/bin/x86_64/gtfToGenePred -allErrors gencode.v3.level_3_annotation.NCBI36.gtf v3Auto.gtf.bed > v3Auto.gtf.err 2>&1 ~/bin/x86_64/gtfToGenePred -allErrors -impliedStopAfterCds gencode.v3.polyAs.NCBI36.gtf v3polyAs.gtf.bed > v3polyAs.gtf.err 2>&1 ~/bin/x86_64/gtfToGenePred -allErrors -impliedStopAfterCds gencode.v3.tRNAs.NCBI36.gtf v3tRNAs.gtf.bed > v3tRNAs.gtf.err 2>&1 84824 v3Manual.gtf.bed 28387 v3Manual.gtf.err 42472 v3Auto.gtf.bed 27353 v3Auto.gtf.err 0 v3polyAs.gtf.bed 23362 v3polyAs.gtf.err 0 v3tRNAs.gtf.bed 622 v3tRNAs.gtf.err ~/bin/x86_64/gtfToGenePred -allErrors
-impliedStopAfterCds gencode.v3.polyAs.NCBI36.gtf v3polyAs.gtf.bed > v3polyAs.gtf.err 2>&1 ~/bin/x86_64/gtfToGenePred -allErrors -impliedStopAfterCds gencode.v3.tRNAs.NCBI36.gtf v3tRNAs.gtf.bed > v3tRNAs.gtf.err 2>&1 # doesn't change anything. What is needed: # For polyAs and tRNAs do not use pipeline. For Manual and Auto, send word to Felix ## Did Rel2 gtf pass gtfToGenePred ? cd ../265 ~/bin/x86_64/gtfToGenePred -allErrors gencode.rel2.manual.gtf test.gtf.bed > test.gtf.err 2>&1 No to manual. Yes to auto. ~/bin/x86_64/gtfToGenePred -allErrors gencode.rel2.manual_noPolyA.gtf test.gtf.bed > test.gtf.err 2>&1 Only "olyA"s fail. So yes Rel2 gtfs were okay by validator At this point Felix agrees to remake the datasets Release 3b 2009-09-03 ===================== cd pip 453 mkdir v3b ftp://ftp.sanger.ac.uk/pub/gencode/gencode.v3b.for_DCC.tgz to v3b mkdir v3 mv gencode.* v3 cd v3b tar -tzf gencode.v3b.for_DCC.tgz tar -xzf gencode.v3b.for_DCC.tgz mv to_release/* .. submit loaded! rename table wgEncodeSangerGencodeGencodeAutoV3 to wgEncodeGencodeAutoV3; rename table wgEncodeSangerGencodeGencodeManualV3 to wgEncodeGencodeManualV3; show tables like "wgEncodeGencode%"; | wgEncodeGencodeAutoRel1 | | wgEncodeGencodeAutoRel2 | | wgEncodeGencodeAutoV3 | | wgEncodeGencodeClassesRel1 | | wgEncodeGencodeClassesRel2 | | wgEncodeGencodeClassesRel2_full | | wgEncodeGencodeClassesRel2_unused | | wgEncodeGencodeManualRel1 | | wgEncodeGencodeManualRel2 | | wgEncodeGencodeManualV3 | | wgEncodeGencodePolyaRel2 | Now the difficult part of loading polyAs, tRNAs and classes create table wgEncodeGencodeAutoV3_tmp select * from wgEncodeGencodeAutoV3; /cluster/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt -noncoding -oldTable hg18 wgEncodeGencodeAutoV3_tmp gencode.v3.tRNAs.NCBI36.gtf > load_tRnas.out 2>&1 & Reading gencode.v3.tRNAs.NCBI36.gtf Read 621 transcripts in 621 lines in 1 files 621 groups 24 seqs 1 sources 1 feature types 621 gene predictions rename table wgEncodeGencodeAutoV3 to 
wgEncodeGencodeAutoV3_noTrnas; rename table wgEncodeGencodeAutoV3_tmp to wgEncodeGencodeAutoV3; grep HAVANA gencode.v3.polyAs.NCBI36.gtf | wc -l 23361 grep polyA_signal gencode.v3.polyAs.NCBI36.gtf | wc -l 21397 grep pseudo_polyA gencode.v3.polyAs.NCBI36.gtf | wc -l 1973 grep polyA_site gencode.v3.polyAs.NCBI36.gtf | wc -l 36 grep -v polyA_signal gencode.v3.polyAs.NCBI36.gtf | grep -v pseudo_polyA | grep -v polyA_site header grep -v polyA_signal gencode.v3.polyAs.NCBI36.gtf | grep -v pseudo_polyA | grep -v polyA_site > gencode.v3.polyAs.NCBI36.header.gtf cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.signal.gtf cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.pseudo.gtf cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.site.gtf grep polyA_signal gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.signal.gtf grep pseudo_polyA gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.pseudo.gtf grep polyA_site gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.site.gtf So now the possible exon types are: polyA_signal,pseudo_polyA,polyA_site # Trick to create empty table create table wgEncodeGencodePolyaV3 select * from wgEncodeGencodePolyaRel2; delete from wgEncodeGencodePolyaV3; ~/bin/x86_64/ldHgGene -exon=polyA_signal -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.signal.gtf > load_polyA.out 2>&1 & /cluster/bin/x86_64/ldHgGene -exon=pseudo_polyA -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.pseudo.gtf >> load_polyA.out 2>&1 & /cluster/bin/x86_64/ldHgGene -exon=polyA_site -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.site.gtf >> load_polyA.out 2>&1 & Reading gencode.v3.polyAs.NCBI36.signal.gtf Read 21397 transcripts in 21397 lines in 1 files 21397 groups 24 seqs 1 sources 2 feature types 21364 gene predictions Reading gencode.v3.polyAs.NCBI36.pseudo.gtf Read 1973 transcripts in 1973 lines in 
1 files 1973 groups 24 seqs 1 sources 3 feature types 1961 gene predictions Reading gencode.v3.polyAs.NCBI36.site.gtf Read 36 transcripts in 36 lines in 1 files 36 groups 6 seqs 1 sources 1 feature types 36 gene predictions # Time to build the classes table ls -1 *.NCBI36*.classes gencode.v3.polyAs.NCBI36.classes gencode.v3.tRNAs.NCBI36.classes gencode.v3b.annotation.NCBI36.level_1_2.no_gene_lines.classes gencode.v3b.annotation.NCBI36.level_3.no_gene_lines.classes head -2 gencode.v3b.annotation.NCBI36.level_1_2.no_gene_lines.classes geneId transcriptId transcriptType level Class ENSG00000223972 ENST00000450305 unprocessed_pseudogene 2 Havana_unprocessed_pseudogene ENSG00000227232 ENST00000488147 unprocessed_pseudogene 2 Havana_unprocessed_pseudogene E) Submit F) load uniq.tabs into hg18 temporarily. echo "CREATE TABLE wgEncodeGencodeClassesV3 ( geneId varchar(255) not null, name varchar(255) not null, transcriptType varchar(255) not null, level integer, class enum ('Undefined', 'Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene', 'Havana_coding','Havana_non_coding','Havana_nonsense','Havana_polyA','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC', 'Ensembl_coding','Ensembl_non_coding','Ensembl_processed_pseudogene','Ensembl_unprocessed_pseudogene','Ensembl_pseudogene','Ensembl_RNA') not null default 'Undefined', # indexes INDEX(transcriptType), INDEX(level,class), PRIMARY KEY(name));" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'gencode.v3.polyAs.NCBI36.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'gencode.v3.tRNAs.NCBI36.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'gencode.v3b.annotation.NCBI36.level_1_2.no_gene_lines.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 
'gencode.v3b.annotation.NCBI36.level_3.no_gene_lines.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18 select count(*) from wgEncodeGencodeClassesV3;| 151278 | select count(*) from wgEncodeGencodeAutoV3; | 43093 | + select count(*) from wgEncodeGencodeManualV3; | 84824 | + select count(*) from wgEncodeGencodePolyaV3; | 23325 | = 151242 What are the extra 36? !!! Once again I fell into the trap: geneId is used as name2 by default but I need geneName!!! # My own copy of ldHgGene uses geneName as name2! create table wgEncodeGencodeManualV3_old select * from wgEncodeGencodeManualV3; delete from wgEncodeGencodeManualV3; ~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeManualV3 gencode.v3b.annotation.NCBI36.level_1_2.no_gene_lines.gtf > reload_manual.out 2>&1 & Reading gencode.v3b.annotation.NCBI36.level_1_2.no_gene_lines.gtf Read 84824 transcripts in 1116896 lines in 1 files 84824 groups 24 seqs 1 sources 6 feature types 84824 gene predictions | 585 | ENST00000450305 | chr1 | + | 1872 | 3533 | 3533 | 3533 | 6 | 1872,2041,2475,2837,3083,3315, | 1920,2090,2560,2915,3237,3533, | 0 | RP11-34P13.1 | none | none | -1,-1,-1,-1,-1,-1, | ------------ drop table wgEncodeGencodeManualV3_old; create table wgEncodeGencodeAutoV3_noTrnas_old select * from wgEncodeGencodeAutoV3_noTrnas; delete from wgEncodeGencodeAutoV3_noTrnas; create table wgEncodeGencodeAutoV3_Trnas select * from wgEncodeGencodeAutoV3_noTrnas; ~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeAutoV3_noTrnas gencode.v3b.annotation.NCBI36.level_3.no_gene_lines.gtf > reload_auto.out 2>&1 & Reading gencode.v3b.annotation.NCBI36.level_3.no_gene_lines.gtf Read 42472 transcripts in 831718 lines in 1 files 42472 groups 25 seqs 1 sources 6 feature types 42472 gene predictions /cluster/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt -noncoding -oldTable hg18 wgEncodeGencodeAutoV3_Trnas gencode.v3.tRNAs.NCBI36.gtf > load_tRnas.out 2>&1 & Reading gencode.v3.tRNAs.NCBI36.gtf Read 621 transcripts in 
621 lines in 1 files 621 groups 24 seqs 1 sources 1 feature types 621 gene predictions drop table wgEncodeGencodeAutoV3; create table wgEncodeGencodeAutoV3 select * from wgEncodeGencodeAutoV3_noTrnas; insert into wgEncodeGencodeAutoV3 select * from wgEncodeGencodeAutoV3_Trnas; select count(*) from wgEncodeGencodeAutoV3;| 43093 | | 585 | ENST00000456328 | chr1 | + | 1736 | 4275 | 4021 | 4252 | 3 | 1736,2475,3083, | 2090,2584,4275, | 0 | RP11-34P13.1 | cmpl | cmpl | -1,-1,0, | ------------ drop table wgEncodeGencodeAutoV3_noTrnas_old; show tables like "wgEncodeGencode%V3%"; +--------------------------------------+ | Tables_in_hg18 (wgEncodeGencode%V3%) | +--------------------------------------+ | wgEncodeGencodeAutoV3 | | wgEncodeGencodeAutoV3_Trnas | | wgEncodeGencodeAutoV3_noTrnas | | wgEncodeGencodeClassesV3 | | wgEncodeGencodeManualV3 | | wgEncodeGencodePolyaV3 | +--------------------------------------+ 6 rows in set (0.09 sec) Now edit trackDb.wgEncode.ra making an alpha and beta version of the wgEncodeSangerGencode track. Add 3 new classes with colors to the track. Because of numerous differences, and because there will soon be an hg19 version, I am creating a new composite called wgEncodeGencode for this release 3. The html will be moved immediately from hg18 to human (but rel2 remains as SangerGencode at hg18). # Spend time massaging metadata # set up search specs for hgFindSpec /cluster/data/encode/pipeline/bin/encodeStatus.pl 453 displayed ### 2009-10-02 Controversy remains for colors and for OTTER IDs vs. Ensembl Ids. Need to resolve before pushing to QA or having Brian do hg19. 2009-11-02 Release 3c ===================== cd pip 453 mkdir v3c ftp://ftp.sanger.ac.uk/pub/gencode/gencode.v3c.for_DCC.tgz to v3c mkdir v3 mv gencode.* v3 cd v3c tar -tzf gencode.v3c.for_DCC.tgz tar -xzf gencode.v3c.for_DCC.tgz mv forDCC/* .. 
# edit DAF/DDF to try to include new 2-way track (polyAs, tRNAs and classes still go by hand) # unload 453 (since names are changed, all should be well). submit # 1) Needed dafVersion 1.1 # 2) needed validationSettings (allowReload seems appropriate) # 3) No such luck on 2-way "no exons defined for Overlap1" loaded! #save first: rename table wgEncodeGencodeAutoV3 to wgEncodeGencodeAutoV3b; rename table wgEncodeGencodeManualV3 to wgEncodeGencodeManualV3b; # now put in place: rename table wgEncodeSangerGencodeGencodeAutoV3 to wgEncodeGencodeAutoV3; rename table wgEncodeSangerGencodeGencodeManualV3 to wgEncodeGencodeManualV3; rename table wgEncodeGencodePolyaV3 to wgEncodeGencodePolyaV3b; rename table wgEncodeGencodeClassesV3 to wgEncodeGencodeClassesV3b; rename table wgEncodeGencodeAutoV3_Trnas to wgEncodeGencodeAutoV3b_Trnas; rename table wgEncodeGencodeAutoV3_noTrnas to wgEncodeGencodeAutoV3b_noTrnas; show tables like "wgEncodeGencode%"; | wgEncodeGencodeAutoRel1 | old: v1 | wgEncodeGencodeAutoRel2 | old: v2 | wgEncodeGencodeAutoV3 | <== New v3c (needs tRNAs) | wgEncodeGencodeAutoV3b | old: v3b | wgEncodeGencodeAutoV3b_Trnas | old: working on it | wgEncodeGencodeAutoV3b_noTrnas | old: working on it | wgEncodeGencodeClassesRel1 | old: v1 | wgEncodeGencodeClassesRel2 | old: v2 | wgEncodeGencodeClassesRel2_full | old: working on it | wgEncodeGencodeClassesRel2_unused | old: working on it | wgEncodeGencodeClassesV3b | old: v3b | wgEncodeGencodeManualRel1 | old: v1 | wgEncodeGencodeManualRel2 | old: v2 | wgEncodeGencodeManualV3 | <== New v3c | wgEncodeGencodeManualV3b | old: v3b | wgEncodeGencodePolyaRel2 | old: v2 | wgEncodeGencodePolyaV3b | old: v3b !!! Once again: geneId is used as name2 by default but I need geneName!!! # Use my local version of ldHgGene, since I edited genePred.c but never did the plumbing to make it a full option # because: geneId is used as name2 by default but I need geneName!!! # empty table! 
create table wgEncodeGencodeManualV3_old select * from wgEncodeGencodeManualV3; # Records: 87627 Duplicates: 0 Warnings: 0 delete from wgEncodeGencodeManualV3; ~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeManualV3 gencode.v3c.annotation.NCBI36.level_1_2.gtf > reload_manual.out 2>&1 & Reading gencode.v3c.annotation.NCBI36.level_1_2.gtf Read 87627 transcripts in 1154766 lines in 1 files 87627 groups 24 seqs 1 sources 6 feature types 87627 gene predictions | 585 | ENST00000450305 | chr1 | + | 1872 | 3533 | 3533 | 3533 | 6 | 1872,2041,2475,2837,3083,3315, | 1920,2090,2560,2915,3237,3533, | 0 | RP11-34P13.1 | none | none | -1,-1,-1,-1,-1,-1, | ------------ drop table wgEncodeGencodeManualV3_old; # empty table! create table wgEncodeGencodeAutoV3_noTrnas select * from wgEncodeGencodeAutoV3; # Records: 43889 Duplicates: 0 Warnings: 0 delete from wgEncodeGencodeAutoV3_noTrnas; create table wgEncodeGencodeAutoV3_Trnas select * from wgEncodeGencodeAutoV3_noTrnas; # Records: 0 Duplicates: 0 Warnings: 0 ~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencodeAutoV3_noTrnas gencode.v3c.annotation.NCBI36.level_3.gtf > reload_auto.out 2>&1 & Reading gencode.v3c.annotation.NCBI36.level_3.gtf Read 43889 transcripts in 867844 lines in 1 files 43889 groups 25 seqs 1 sources 6 feature types 43889 gene predictions /cluster/bin/x86_64/ldHgGene -exon=tRNAscan -genePredExt -noncoding -oldTable hg18 wgEncodeGencodeAutoV3_Trnas gencode.v3.tRNAs.NCBI36.gtf > load_tRnas.out 2>&1 & Reading gencode.v3.tRNAs.NCBI36.gtf Read 621 transcripts in 621 lines in 1 files 621 groups 24 seqs 1 sources 1 feature types 621 gene predictions drop table wgEncodeGencodeAutoV3; create table wgEncodeGencodeAutoV3 select * from wgEncodeGencodeAutoV3_noTrnas; insert into wgEncodeGencodeAutoV3 select * from wgEncodeGencodeAutoV3_Trnas; select count(*) from wgEncodeGencodeAutoV3;| 44510 | | 585 | ENST00000456328 | chr1 | + | 1736 | 4275 | 4021 | 4252 | 3 | 1736,2475,3083, | 2090,2584,4275, | 0 | 
RP11-34P13.1 | cmpl | cmpl | -1,-1,0, | ------------ drop table wgEncodeGencodeAutoV3_noTrnas_old; ### Now the difficult part of loading polyAs, 2way and classes # determine if only 3 types of polyA: grep -v polyA_signal gencode.v3.polyAs.NCBI36.gtf | grep -v pseudo_polyA | grep -v polyA_site # Header only header grep -v polyA_signal gencode.v3.polyAs.NCBI36.gtf | grep -v pseudo_polyA | grep -v polyA_site > gencode.v3.polyAs.NCBI36.header.gtf cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.signal.gtf cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.pseudo.gtf cp gencode.v3.polyAs.NCBI36.header.gtf gencode.v3.polyAs.NCBI36.site.gtf grep polyA_signal gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.signal.gtf grep pseudo_polyA gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.pseudo.gtf grep polyA_site gencode.v3.polyAs.NCBI36.gtf >> gencode.v3.polyAs.NCBI36.site.gtf # So now the possible exon types are: polyA_signal,pseudo_polyA,polyA_site # empty table: create table wgEncodeGencodePolyaV3 select * from wgEncodeGencodePolyaV3b; delete from wgEncodeGencodePolyaV3; ~/bin/x86_64/ldHgGene -exon=polyA_signal -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.signal.gtf > load_polyA.out 2>&1 /cluster/bin/x86_64/ldHgGene -exon=pseudo_polyA -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.pseudo.gtf >> load_polyA.out 2>&1 /cluster/bin/x86_64/ldHgGene -exon=polyA_site -genePredExt -noncoding -oldTable hg18 wgEncodeGencodePolyaV3 gencode.v3.polyAs.NCBI36.site.gtf >> load_polyA.out 2>&1 Reading gencode.v3.polyAs.NCBI36.signal.gtf Read 21397 transcripts in 21397 lines in 1 files 21397 groups 24 seqs 1 sources 2 feature types 21364 gene predictions Reading gencode.v3.polyAs.NCBI36.pseudo.gtf Read 1973 transcripts in 1973 lines in 1 files 1973 groups 24 seqs 1 sources 3 feature types 1961 gene predictions Reading gencode.v3.polyAs.NCBI36.site.gtf Read 36 
transcripts in 36 lines in 1 files 36 groups 6 seqs 1 sources 1 feature types 36 gene predictions # Now 2way... create table wgEncodeGencode2WayV3 select * from wgEncodeGencodePolyaV3; delete from wgEncodeGencode2WayV3; ~/bin/x86_64/ldHgGene -gtf -genePredExt hg18 wgEncodeGencode2WayV3 gencode.v3.2wayconspseudos.NCBI36.gtf > load_2way.out 2>&1 Reading gencode.v3.2wayconspseudos.NCBI36.gtf Read 9474 transcripts in 9474 lines in 1 files 9474 groups 24 seqs 1 sources 1 feature types 0 gene predictions ### Didn't work... grep Yale_UCSC gencode.v3.2wayconspseudos.NCBI36.gtf | wl 9474 grep transcript gencode.v3.2wayconspseudos.NCBI36.gtf | wl 9474 # so let's treat "transcript" as the exon ~/bin/x86_64/ldHgGene -exon=transcript -genePredExt -noncoding -oldTable hg18 wgEncodeGencode2WayV3 gencode.v3.2wayconspseudos.NCBI36.gtf > load_2way.out 2>&1 Reading gencode.v3.2wayconspseudos.NCBI36.gtf Read 9474 transcripts in 9474 lines in 1 files 9474 groups 24 seqs 1 sources 1 feature types 9474 gene predictions | 585 | Overlap414 | chr1 | - | 118891 | 123443 | 0 | 0 | 1 | 118891, | 123443, | 0 | Overlap414 | none | none | -1, | select count(*) from wgEncodeGencode2WayV3;| 9474 | # Time to build the classes table ls -1 *.NCBI36*.classes gencode.v3.2wayconspseudos.NCBI36.classes gencode.v3.polyAs.NCBI36.classes gencode.v3.tRNAs.NCBI36.classes gencode.v3c.annotation.NCBI36.level_1_2.classes gencode.v3c.annotation.NCBI36.level_3.classes head -2 gencode.v3c.annotation.NCBI36.level_1_2.classes geneId transcriptId transcriptType level Class OTT TranscriptId OTT GeneId ENSG00000223972 ENST00000450305 unprocessed_pseudogene 2 Havana_unprocessed_pseudogene OTTHUMG00000000961 OTTHUMT00000002844 ENSG00000227232 ENST00000488147 unprocessed_pseudogene 2 Havana_unprocessed_pseudogene OTTHUMG00000000958 OTTHUMT00000002839 head -1 gencode.v3c.annotation.NCBI36.level_3.classes ENSG00000223972 ENST00000456328 protein_coding 3 Ensembl_coding . . # Notice that OTTs are '.' 
head -2 gencode.v3.2wayconspseudos.NCBI36.classes Overlap1 Overlap1 pseudogene 3 2way_pseudogene Overlap2 Overlap2 pseudogene 3 2way_pseudogene head -1 gencode.v3.polyAs.NCBI36.classes 440716 440716 polyA_signal 2 Havana_polyA head -1 gencode.v3.tRNAs.NCBI36.classes 38172 38172 tRNAscan 3 Ensembl_RNA # Notice that there are not fields for OTT. What to do? Load them then set all '.' OTTs to NULL E) Submit F) load uniq.tabs into hg18 temporarily. echo "CREATE TABLE wgEncodeGencodeClassesV3 ( geneId varchar(255) not null, name varchar(255) not null, transcriptType varchar(255) not null, level integer, class enum ('Undefined', 'Validated_coding','Validated_processed','Validated_processed_pseudogene','Validated_unprocessed_pseudogene','Validated_pseudogene', 'Havana_coding','Havana_non_coding','Havana_nonsense','Havana_polyA','Havana_processed_pseudogene','Havana_unprocessed_pseudogene','Havana_pseudogene','Havana_TEC', 'Ensembl_coding','Ensembl_non_coding','Ensembl_processed_pseudogene','Ensembl_unprocessed_pseudogene','Ensembl_pseudogene','Ensembl_RNA') not null default 'Undefined', ottTranscriptId varchar(255), ottGeneId varchar(255), # indexes INDEX(transcriptType), INDEX(level,class), PRIMARY KEY(name));" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'gencode.v3c.annotation.NCBI36.level_1_2.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'gencode.v3c.annotation.NCBI36.level_3.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'gencode.v3.tRNAs.NCBI36.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'gencode.v3.polyAs.NCBI36.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18 echo "LOAD DATA LOCAL INFILE 'gencode.v3.2wayconspseudos.NCBI36.classes' into table wgEncodeGencodeClassesV3" | hgsql hg18 select count(*) from wgEncodeGencodeClassesV3;| 164972 | select count(*) from wgEncodeGencodeAutoV3; | 44510 | + select count(*) from 
wgEncodeGencodeManualV3; | 87627 | + select count(*) from wgEncodeGencodePolyaV3; | 23361 | + select count(*) from wgEncodeGencode2WayV3; | 9474 | = 164972 show tables like "wgEncodeGencode%V3%"; +--------------------------------------+ | Tables_in_hg18 (wgEncodeGencode%V3%) | +--------------------------------------+ | wgEncodeGencode2WayV3 | <== New v3c | wgEncodeGencodeAutoV3 | <== New v3c | wgEncodeGencodeAutoV3_Trnas | <== Save for building AutoV3 | wgEncodeGencodeAutoV3_noTrnas | <== Save for building AutoV3 | wgEncodeGencodeAutoV3b | Old V3b | wgEncodeGencodeAutoV3b_Trnas | Old V3b for building AutoV3b | wgEncodeGencodeAutoV3b_noTrnas | Old V3b for building AutoV3b | wgEncodeGencodeClassesV3 | <== New v3c | wgEncodeGencodeClassesV3b | Old V3b | wgEncodeGencodeManualV3 | <== New v3c | wgEncodeGencodeManualV3b | Old V3b | wgEncodeGencodePolyaV3 | <== New v3c | wgEncodeGencodePolyaV3b | Old V3b +--------------------------------------+ 6 rows in set (0.09 sec) select count(*) from wgEncodeGencodeClassesV3 where class = "Undefined"; | 0 | select count(*) from wgEncodeGencodeClassesV3 where class = "Validated_coding"; | 0 | select count(*) from wgEncodeGencodeClassesV3 where class = "Validated_processed"; | 0 | select count(*) from wgEncodeGencodeClassesV3 where class = "Validated_processed_pseudogene"; | 3028 | select count(*) from wgEncodeGencodeClassesV3 where class = "Validated_unprocessed_pseudogene";| 67 | select count(*) from wgEncodeGencodeClassesV3 where class = "Validated_pseudogene"; | 74 | select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_coding"; | 36639 | select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_non_coding"; | 37583 | select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_nonsense"; | 4688 | select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_polyA"; | 23361 | select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_processed_pseudogene"; | 3261 | select count(*) from 
wgEncodeGencodeClassesV3 where class = "Havana_unprocessed_pseudogene";| 1344 | select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_pseudogene"; | 895 | select count(*) from wgEncodeGencodeClassesV3 where class = "Havana_TEC"; | 48 | select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_coding"; | 32289 | select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_non_coding"; | 395 | select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_processed_pseudogene"; | 138 | select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_unprocessed_pseudogene";| 9 | select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_pseudogene"; | 1852 | select count(*) from wgEncodeGencodeClassesV3 where class = "Ensembl_RNA"; | 9825 | select count(*) from wgEncodeGencodeClassesV3 where class = "";| 9476 | # Yipes. However, this is a familiar number select distinct class,level,transcriptType from wgEncodeGencodeClassesV3 order by class,level,transcriptType; | | 3 | non_coding | | | 3 | pseudogene | select count(*) from wgEncodeGencodeClassesV3 where transcriptType = "non_coding" and class = ""; | 2 | | ENSG00000225880 | ENST00000426669 | non_coding | 3 | | . | . | | ENSG00000178796 | ENST00000431954 | non_coding | 3 | | . | . | update wgEncodeGencodeClassesV3 set class = "Ensembl_non_coding" where transcriptType = "non_coding" and class = ""; ### Note back to Felix: update wgEncodeGencodeClassesV3 set class = "Ensembl_non_coding" where transcriptType = "non_coding" and class = ""; ### Note back to Felix: select count(*) from wgEncodeGencodeClassesV3 where ottGeneId = '.'; | 54613 | select count(*) from wgEncodeGencodeClassesV3 where ottGeneId = '.' and ottTranscriptId = '.';| 54613 | update wgEncodeGencodeClassesV3 set ottGeneId = NULL,ottTranscriptId = NULL where ottGeneId = '.' 
and ottTranscriptId = '.'; # Rows matched: 54613 Changed: 54613 Warnings: 0 select count(*) from wgEncodeGencodeClassesV3 where transcriptType = "pseudogene" and class = ""; | 9474 | # familiar number needs new class? select count(*) from wgEncodeGencodeClassesV3 where class = ""; | 9474 | # familiar number needs new class? update wgEncodeGencodeClassesV3 set class = "2way_consensus_pseudogene" where transcriptType = "pseudogene" and class = ""; ## Rows matched: 9474 Changed: 0 Warnings: 9474 select count(*) from wgEncodeGencodeClassesV3 where class = "2way_consensus_pseudogene"; | 0 | # Must change class enum definition first!!! rename table wgEncodeGencodeClassesV3 to wgEncodeGencodeClassesV3_tmp; echo "CREATE TABLE wgEncodeGencodeClassesV3 ( geneId varchar(255) not null, name varchar(255) not null, transcriptType varchar(255) not null, level integer, class enum ('Undefined', 'Validated_coding','Validated_processed','Validated_processed_pseudogene', 'Validated_unprocessed_pseudogene','Validated_pseudogene', 'Havana_coding','Havana_non_coding','Havana_nonsense','Havana_polyA', 'Havana_processed_pseudogene','Havana_unprocessed_pseudogene', 'Havana_pseudogene','Havana_TEC', 'Ensembl_coding','Ensembl_non_coding','Ensembl_processed_pseudogene', 'Ensembl_unprocessed_pseudogene','Ensembl_pseudogene','Ensembl_RNA', '2way_consensus_pseudogene') not null default 'Undefined', ottTranscriptId varchar(255), ottGeneId varchar(255), # indexes INDEX(transcriptType), INDEX(level,class), PRIMARY KEY(name));" | hgsql hg18 insert into wgEncodeGencodeClassesV3 select * from wgEncodeGencodeClassesV3_tmp; # Records: 164972 Duplicates: 0 Warnings: 0 ### Note back to Felix: update wgEncodeGencodeClassesV3 set class = "2way_consensus_pseudogene" where transcriptType = "pseudogene" and class = ""; # Rows matched: 9474 Changed: 9474 Warnings: 0 drop table wgEncodeGencodeClassesV3_tmp; select distinct class,level,transcriptType from wgEncodeGencodeClassesV3 order by 
class,level,transcriptType; | 2way_consensus_pseudogene | 3 | pseudogene | | Validated_processed_pseudogene | 1 | processed_pseudogene | | Validated_processed_pseudogene | 1 | transcribed_processed_pseudogene | | Validated_unprocessed_pseudogene | 1 | transcribed_unprocessed_pseudogene | | Validated_unprocessed_pseudogene | 1 | unprocessed_pseudogene | | Validated_pseudogene | 1 | IG_pseudogene | | Validated_pseudogene | 1 | pseudogene | | Validated_pseudogene | 1 | unitary_pseudogene | | Havana_coding | 2 | IG_C_gene | | Havana_coding | 2 | IG_D_gene | | Havana_coding | 2 | IG_J_gene | | Havana_coding | 2 | IG_V_gene | | Havana_coding | 2 | protein_coding | | Havana_non_coding | 2 | ambiguous_orf | | Havana_non_coding | 2 | antisense | | Havana_non_coding | 2 | non_coding | | Havana_non_coding | 2 | processed_transcript | | Havana_non_coding | 2 | retained_intron | | Havana_nonsense | 2 | nonsense_mediated_decay | | Havana_polyA | 2 | polyA_signal | | Havana_polyA | 2 | pseudo_polyA | | Havana_processed_pseudogene | 2 | processed_pseudogene | | Havana_processed_pseudogene | 2 | transcribed_processed_pseudogene | | Havana_unprocessed_pseudogene | 2 | transcribed_unprocessed_pseudogene | | Havana_unprocessed_pseudogene | 2 | unprocessed_pseudogene | | Havana_pseudogene | 2 | IG_pseudogene | | Havana_pseudogene | 2 | polymorphic_pseudogene | | Havana_pseudogene | 2 | pseudogene | | Havana_pseudogene | 2 | retrotransposed | | Havana_pseudogene | 2 | TR_pseudogene | | Havana_pseudogene | 2 | unitary_pseudogene | | Havana_TEC | 2 | artifact | | Havana_TEC | 2 | TEC | | Ensembl_coding | 3 | IG_C_gene | | Ensembl_coding | 3 | IG_D_gene | | Ensembl_coding | 3 | IG_J_gene | | Ensembl_coding | 3 | IG_V_gene | | Ensembl_coding | 3 | protein_coding | | Ensembl_non_coding | 3 | antisense | | Ensembl_non_coding | 3 | non_coding | | Ensembl_non_coding | 3 | processed_transcript | | Ensembl_non_coding | 3 | retained_intron | | Ensembl_processed_pseudogene | 3 | 
processed_pseudogene | | Ensembl_unprocessed_pseudogene | 3 | unprocessed_pseudogene | | Ensembl_pseudogene | 3 | IG_pseudogene | | Ensembl_pseudogene | 3 | miRNA_pseudogene | | Ensembl_pseudogene | 3 | misc_RNA_pseudogene | | Ensembl_pseudogene | 3 | pseudogene | | Ensembl_pseudogene | 3 | retrotransposed | | Ensembl_pseudogene | 3 | unitary_pseudogene | | Ensembl_RNA | 3 | miRNA | | Ensembl_RNA | 3 | misc_RNA | | Ensembl_RNA | 3 | Mt_rRNA | | Ensembl_RNA | 3 | Mt_tRNA | | Ensembl_RNA | 3 | Mt_tRNA_pseudogene | | Ensembl_RNA | 3 | rRNA | | Ensembl_RNA | 3 | rRNA_pseudogene | | Ensembl_RNA | 3 | scRNA_pseudogene | | Ensembl_RNA | 3 | snoRNA | | Ensembl_RNA | 3 | snoRNA_pseudogene | | Ensembl_RNA | 3 | snRNA | | Ensembl_RNA | 3 | snRNA_pseudogene | | Ensembl_RNA | 3 | tRNAscan | | Ensembl_RNA | 3 | tRNA_pseudogene | | 2way_consensus_pseudogene | 3 | pseudogene | select distinct class,level,transcriptType from wgEncodeGencode2WayV3 order by class,level,transcriptType; # Oops, ottTranscriptId and ottGeneId appear to be reversed! 
rename table wgEncodeGencodeClassesV3 to wgEncodeGencodeClassesV3_tmp; echo "CREATE TABLE wgEncodeGencodeClassesV3 ( geneId varchar(255) not null, name varchar(255) not null, transcriptType varchar(255) not null, level integer, class enum ('Undefined', 'Validated_coding','Validated_processed','Validated_processed_pseudogene', 'Validated_unprocessed_pseudogene','Validated_pseudogene', 'Havana_coding','Havana_non_coding','Havana_nonsense','Havana_polyA', 'Havana_processed_pseudogene','Havana_unprocessed_pseudogene', 'Havana_pseudogene','Havana_TEC', 'Ensembl_coding','Ensembl_non_coding','Ensembl_processed_pseudogene', 'Ensembl_unprocessed_pseudogene','Ensembl_pseudogene','Ensembl_RNA', '2way_consensus_pseudogene') not null default 'Undefined', ottGeneId varchar(255), ottTranscriptId varchar(255), # indexes INDEX(transcriptType), INDEX(level,class), PRIMARY KEY(name));" | hgsql hg18 insert into wgEncodeGencodeClassesV3 select * from wgEncodeGencodeClassesV3_tmp; # Records: 164972 Duplicates: 0 Warnings: 0 drop table wgEncodeGencodeClassesV3_tmp; ## Now edit trackDb.wgEncode.ra making a new 2way subtrack # Spend time massaging metadata # set up search specs for hgFindSpec /cluster/data/encode/pipeline/bin/encodeStatus.pl 453 displayed # change metadata to "Gencode October 2009 Freeze" # Search spec OTT ID??? # in trackDb.wgEncode.ra... 
# (added Angie's hgFindSpec wiki entry) needed xrefQuery to wgEncodeGencode Classes # Need to get downloads dir in order cd {downloadsDir} mkdir release2/ cp index.html release2/ cp fileDb.ra release2/ mv wgEncodeGencode*Rel2*.gz release2/ # wgEncodeSangerGencodeGencode versions were made by pipeline mv wgEncodeSangerGencodeGencodeManualV3.gtf.gz wgEncodeGencodeManualV3.gtf.gz mv wgEncodeSangerGencodeGencodeAutoV3.gtf.gz wgEncodeGencodeAutoV3.gtf.gz # manually make the rest pushd {pip}/453 cat gencode.v3c.annotation.NCBI36.level_1_2.classes >> gencode.v3c.NCBI36.all.classes cat gencode.v3c.annotation.NCBI36.level_3.classes >> gencode.v3c.NCBI36.all.classes cat gencode.v3.tRNAs.NCBI36.classes >> gencode.v3c.NCBI36.all.classes cat gencode.v3.2wayconspseudos.NCBI36.classes >> gencode.v3c.NCBI36.all.classes cat gencode.v3.polyAs.NCBI36.classes >> gencode.v3c.NCBI36.all.classes wl gencode.v3c.NCBI36.all.classes 164972 pop gzip --stdout /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/gencode.v3.2wayconspseudos.NCBI36.gtf > wgEncodeGencode2wayV3.gtf.gz gzip --stdout /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/gencode.v3.polyAs.NCBI36.gtf > wgEncodeGencodePolyaV3.gtf.gz gzip --stdout /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/gencode.v3.tRNAs.NCBI36.gtf > wgEncodeGencodeTrnasV3.gtf.gz gzip --stdout /hive/groups/encode/dcc/pipeline/encpipeline_prod/453/gencode.v3c.NCBI36.all.classes > wgEncodeGencodeClassesV3.tab.gz #edit fileDb.ra encodeDownloadsPage.pl index.html /cluster/data/encode/pipeline/bin/encodeStatus.pl 453 displayed # Rachel says that pseudogenes need links out to Yale. Oh boy. That will wait til another day! create table trackDb_qateam select * from trackDb_tdreszer; create table hgFindSpec_qateam select * from hgFindSpec_tdreszer;