#!/bin/csh exit; DONE First chimp release # DOWNLOAD FILES FROM WHITEHEAD (kate) #ftp-genome.wi.mit.edu #Dir: Assembly.Nov13.2003 # Files: # contigs.bases.gz # contigs.quals.gz # assembly.agp gunzip *.gz #The scaffold count is 37930 #% faSize contigs.bases # Get AGP from LaDeana Hillier ssh kksilo cd /cluster/data/panTro1 wget genome.wustl.edu:/private/lhillier/old/pan_troglodytes_agp.040119.tar.gz gunzip pan_troglodytes_agp.040119.tar.gz tar xvf pan_troglodytes_agp.040119.tar rm pan_troglodytes_agp.040119.tar # creates dir pan_troglodytes_agp, with an AGP per chrom # REPEATMASK (2003-12-03 and 2004-01-20 kate) # Split into 500KB chunks for RepeatMasking ssh kksilo cd /cluster/data/panTro1 mkdir -p split500K faSplit sequence contigs.bases 10 split500K/ cd split500K foreach f (*.fa) set d = $f:t:r mkdir -p $d faSplit about $f 500000 $d/ rm $f end # Create RM script and cluster job # NOTE: actual run was performed in two cluster jobs -- the second # was RMRun.chimpLib, for the chimp-specific repeats cd /cluster/data/panTro1 mkdir -p jkStuff mkdir -p RMRun rm -f RMRun/RMJobs cat << '_EOF_' > jkStuff/RMChimp #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/panTro1/$2 /bin/cp $2 /tmp/panTro1/$2 cd /tmp/panTro1/$2 # mask with chimp lib /cluster/bluearc/RepeatMasker/RepeatMasker -s -ali -nolow -lib /cluster/bluearc/RepeatMasker030619/Libraries/chimp.lib $2 popd /bin/cp /tmp/panTro1/$2/$2.out ./$2.chimp.out if (-e /tmp/panTro1/$2/$2.align) /bin/cp /tmp/panTro1/$2/$2.align ./$2.chimp.align /bin/rm -f /tmp/panTro1/$2/* cd $1 pushd . /bin/cp $2 /tmp/panTro1/$2 cd /tmp/panTro1/$2 # mask with default primate lib /cluster/bluearc/RepeatMasker/RepeatMasker -s -ali $2 popd /bin/cp /tmp/panTro1/$2/$2.out ./$2.out if (-e /tmp/panTro1/$2/$2.align) /bin/cp /tmp/panTro1/$2/$2.align ./$2.align /bin/rm -fr /tmp/panTro1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/panTro1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/panTro1 '_EOF_' chmod +x jkStuff/RMChimp mkdir -p RMRun rm -f RMRun/RMJobs foreach d ( split500K/?? ) foreach f ( $d/*.fa ) set f = $f:t echo /cluster/data/panTro1/jkStuff/RMChimp \ /cluster/data/panTro1/$d $f \ '{'check out line+ /cluster/data/panTro1/$d/$f.out'}' \ >> RMRun/RMJobs end end #5367 RMJobs # Run cluster job sh kk cd /cluster/data/panTro1/RMRun para create RMJobs para try, para check, para check, para push, para check,... # TRF: Tandem Repeat Finder (DONE 2004-01-19 kate) # create job list of 5MB chunks # Redoing this instead of using the pt0 trf beds, so as to # include the new and changed scaffolds in the Nov. 13 assembly ssh kksilo cd /cluster/data/panTro1 mkdir -p split5M mkdir -p /cluster/bluearc/panTro1/split5M faSplit about contigs.bases 5000000 /cluster/bluearc/panTro1/split5M/ cd /cluster/data/panTro1 mkdir -p bed/simpleRepeat cd bed/simpleRepeat mkdir trf ls -1 /cluster/bluearc/panTro1/split5M/*.fa > genome.lst touch single cat << 'EOF' > gsub #LOOP /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)} /dev/null -bedAt={check out line+ /cluster/data/panTro1/bed/simpleRepeat/trf/$(root1).bed} -tempDir=/tmp #ENDLOOP 'EOF' gensub2 genome.lst single gsub spec ssh kk cd /cluster/data/panTro1/bed/simpleRepeat para create spec # 546 jobs written to batch para try para push # MAKE LINEAGE-SPECIFIC REPEATS FOR HUMAN (DONE 2004-06-21 kate) # Lacking input from Arian, and using blastzSelf as a model, # I'm using all chimp repeats for the human/chimp blastz. # Scripts expect *.out.spec filenames. ssh kkr1u00 cd /cluster/data/panTro1 mkdir /iscratch/i/chimp/panTro1/linSpecRep.human foreach f (?{,}/*.fa.out) cp -p $f /iscratch/i/chimp/panTro1/linSpecRep.human/$f:t:r:r.out.spec end iSync # FILTER SIMPLE REPEATS INTO MASK (DONE 2004-01-25 kate) # make a filtered version # of the trf output: # keep trf's with period <= 12: # NOTE: did chr1_random on 2004-01-26 kate) ssh kksilo cd /cluster/data/panTro1/bed/simpleRepeat mkdir -p trfMask foreach f (trf/*.bed) echo "filtering $f" awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # MASK CONTIGS WITH REPEATMASKER (DONE 2004-01-20 kate) ssh kksilo cd /cluster/data/panTro1 cd split500K cat > maskRM.csh << 'EOF' foreach d (??) foreach f ($d/*.fa) echo "Masking $f" maskOutFa $f $f.out $f.rmsk1 -soft maskOutFa $f.rmsk1 $f.chimp.out $f.rmsk -softAdd end end 'EOF' csh maskRM.csh >&! maskRM.log & tail -100f maskRM.log #WARNING: negative rEnd: -92 contig_143568:1128-1159 MER46B #WARNING: negative rEnd: -155 contig_143869:5374-5508 AluJb # etc. # Comment in rn3 doc (Hiram) indicates these are OK... # MASK CONTIGS WITH TRF # Merge 500K masked chunks into single file, then split into 5Mb chunks # to prepare for TRF masking ssh kksilo cd /cluster/data/panTro1 cd split500K foreach d (??) echo "Contig dir $d" foreach f ($d/?.fa.rmsk $d/??.fa.rmsk $d/???.fa.rmsk) set g = $f:h cat $f >> $g.fa end end # check the split500K/??.fa masked files, then create single # fasta file with repeatmasked sequence cd /cluster/data/panTro1 cat split500K/??.fa > contigs.bases.rmsk # check the rmsk file, then rm split500K/??.fa mkdir -p split5M.rmsk faSplit about contigs.bases.rmsk 5000000 split5M.rmsk/ cat > bed/simpleRepeat/runTrf.csh << 'EOF' foreach f (split5M.rmsk/*.fa) echo "TRF Masking $f" set b = $f:t:r maskOutFa $f bed/simpleRepeat/trfMask/$b.bed $f.msk -softAdd end 'EOF' csh bed/simpleRepeat/runTrf.csh >&! bed/simpleRepeat/runTrf.log & tail -100f bed/simpleRepeat/runTrf.log # create single fasta file with repeatmasked and trf-masked sequence cat split5M.rmsk/*.fa.msk > contigs.bases.msk # MAKE SCAFFOLDS FROM CONTIGS (DONE 2004-01-20 kate) agpAllToFaFile assembly.agp contigs.bases.msk scaffolds.msk.fa -sizes=scaffold /cluster/bin/scripts/agpToLift < assembly.agp > assembly.lft # MAKE CHROMS FROM SCAFFOLDS (DONE 2004-01-20 kate, except chrUn) # NOTE: chr1_random lost somewhere -- redone 2004-01-26 kate ssh kksilo cd /cluster/data/panTro1 cp scaffolds.msk.fa /cluster/bluearc/panTro1 cat > jkStuff/makeChr.csh << 'EOF' cd /cluster/data/panTro1/pan_troglodytes_agp foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 X Y Un M) echo $c mkdir -p ../$c if (-e ptr${c}.agp) then # Remove comments as agpToFa can't handle them # NOTE: chr names are "ptrN" in the AGP files sed -e 's/#.*//' -e 's/ptr/chr/' ptr${c}.agp > ../$c/chr${c}.agp ~/bin/x86_64/agpToFa -simpleMultiMixed \ ../$c/chr${c}.agp chr${c} \ /cluster/data/panTro1/$c/chr${c}.fa \ /cluster/bluearc/panTro1/scaffolds.msk.fa endif if (-e ptr${c}_random.agp) then sed -e 's/#.*//' -e 's/ptr/chr/' ptr${c}_random.agp \ > ../$c/chr${c}_random.agp ~/bin/x86_64/agpToFa -simpleMultiMixed \ ../$c/chr${c}_random.agp chr${c}_random \ /cluster/data/panTro1/$c/chr${c}_random.fa \ /cluster/bluearc/panTro1/scaffolds.msk.fa endif end 'EOF' ssh kolossus cd /cluster/data/panTro1 csh jkStuff/makeChr.csh >&! jkStuff/makeChr.log & tail -100f jkStuff/makeChr.log # Un # Program error: trying to allocate 807899783 bytes in needLargeMem # agpToFa: memalloc.c:82: needLargeMem: Assertion `0' failed. # Abort (core dumped) # This is an 800M chrom! (1500 scaffolds with 50K gaps) # Have requested smaller gaps, will redo it # REDO chrUn with revised AGP file (10K gaps) from LaDeana Hillier # (IN PROGRESS 2004-01-23 kate) ssh kolossus cd /cluster/data/panTro1/pan_troglodytes_agp set c = Un sed -e 's/#.*//' -e 's/ptr/chr/' ptr${c}_random.agp \ > ../$c/chr${c}_random.agp ~/bin/x86_64/agpToFa -simpleMultiMixed \ ../$c/chr${c}_random.agp chr${c}_random \ /cluster/data/panTro1/$c/${c}_random.fa \ /cluster/bluearc/panTro1/scaffolds.msk.fa ssh kksilo cd /cluster/data/panTro1 cat > jkStuff/makeNib.csh << 'EOF' cd /cluster/data/panTro1 foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 X Y M Un) echo $c cd $c foreach f (*.fa) /cluster/bin/i386/faToNib -softMask $f ../nib/chr$f:r.nib end cd .. end 'EOF' csh jkStuff/makeNib.csh >&! jkStuff/makeNib.log & tail -100f jkStuff/makeNib.log # Make symbolic links from /gbdb/panTro1/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/panTro1/nib foreach f (/cluster/data/panTro1/nib/chr*.nib) ln -s $f /gbdb/panTro1/nib end # make lift files from scaffold->chrom agp's, with agpToLift that # supports negative strand ssh kksilo cd /cluster/data/panTro1 foreach d (?{,?}) echo $d if (-e $d/chr$d.agp) then echo "lifting chr$d.agp" jkStuff/agpToLift < $d/chr$d.agp > $d/chr$d.lft endif if (-e $d/chr${d}_random.agp) then echo "lifting chr${d}_random.agp" jkStuff/agpToLift < $d/chr${d}_random.agp > $d/chr${d}_random.lft endif end cat ?{,?}/*.lft > jkStuff/scaffolds.lft # CREATE DATABASE (IN PROGRESS 2004-01-21 kate, still needs chrUn in chromINfo) ssh hgwdev # use df to make sure there is at least 5 gig free on hgwdev:/var/lib/mysql df -h /var/lib/mysql # Filesystem Size Used Avail Use% Mounted on # /dev/sdc1 1.8T 211G 1.5T 13% /var/lib/mysql hgsql -e "CREATE DATABASE panTro1" # if you need to delete this database: !!! WILL DELETE EVERYTHING !!! # hgsql -e "drop database panTro1" # create "grp" table for track grouping hgsql panTro1 \ -e "CREATE TABLE grp (PRIMARY KEY(NAME)) select * from panTro1.grp" # Load /gbdb/panTro1/nib paths into database and save size info. hgsql panTro1 < ~/kent/src/hg/lib/chromInfo.sql cd /cluster/data/panTro1 hgNibSeq -preMadeNib panTro1 /gbdb/panTro1/nib ?{,?}/chr?{,?}{,_random}.fa echo "select chrom,size from chromInfo" | hgsql -N panTro1 > chrom.sizes # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (IN PROGERSS 2004-01-20 kate) ssh hgwdev echo 'INSERT INTO defaultDb VALUES ("Chimp", "panTro1");' \ | hgsql -h genome-testdb hgcentraltest # Warning: genome and organism fields must correspond # with defaultDb values, start as not ACTIVE echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, \ scientificName, htmlPath, hgNearOK) values \ ("panTro1", "Nov. 2003", "/gbdb/panTro1/nib", "Chimp", \ "chr6:115705331-115981791", 0, 15, "Chimp", \ "Pan troglodytes", "/gbdb/panTro1/html/description.html", 0);' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/src/hg/makeDb/trackDb mkdir -p chimp/panTro1 # Add chimp/panTro1/description.html cvs up -d -P # Edit that makefile to add panTro1 in all the right places and do make update # MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE - 2003-07-31 - Hiram) ssh hgwdev echo 'INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("panTro1", "blat8", "17778", "1"); \ INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("panTro1", "blat8", "17779", "0");' \ | hgsql -h genome-testdb hgcentraltest # ASSEMBLY AND GAP TRACKS (DONE 2004-01-26 kate) ssh kksilo cd /cluster/data/panTro1 ssh hgwdev hgGoldGapGl -noGl panTro1 /cluster/data/panTro1 . # generate contig-level gaps from .gap files generated by scaffoldFaToAgp ssh kksilo cd /cluster/data/panTro1 ~kate/bin/i386/liftUp assembly.chrom.gap jkStuff/scaffolds.lft warn assembly.scaffold.gap # extract and merge contig-level and scaffold level gaps into .gap files ssh hgwdev cd /cluster/data/panTro1 cat > jkStuff/makeGaps.csh << 'EOF' cd /cluster/data/panTro1 foreach c (?{,?}) echo "loading gaps for chr$c" cd $c if ( -e chr$c.agp ) then sed -n "/^chr$c\t/p" ../assembly.chrom.gap | sort -n +1 > chr$c.frag.gap grep 'N' chr$c.agp > chr$c.contig.gap cat chr$c.frag.gap chr$c.contig.gap | sort -n +1 > chr$c.gap endif if ( -e chr${c}_random.agp ) then grep "chr${c}_random" ../assembly.chrom.gap | sort -n +1 > chr${c}_random.frag.gap grep 'N' chr${c}_random.agp > chr${c}_random.contig.gap cat chr${c}_random.frag.gap chr${c}_random.contig.gap | sort -n +1 > chr${c}_random.gap endif rm chr*.frag.gap chr*.contig.gap cd .. end 'EOF' csh jkStuff/makeGaps.csh > jkStuff/makeGaps.log & tail -100f jkStuff/makeGaps.log hgLoadGap panTro1 /cluster/data/panTro1 featureBits panTro1 gold # 3109246583 bases of 3109246583 (100.000%) in intersection featureBits panTro1 gap # 1311128857 bases of 3109246583 (42.169%) in intersection # REPEATMASKER TRACK (DONE 2004-01-21 kate) # Split chrom into 5MB chunks ssh kksilo cd /cluster/data/panTro1 cat > jkStuff/splitChrom.csh << 'EOF' foreach c (?{,?}) echo $c if (-e $c/chr$c.fa) then cp -p $c/chr$c.agp $c/chr$c.agp.bak cp -p $c/chr$c.fa $c/chr$c.fa.bak splitFaIntoContigs $c/chr$c.agp $c/chr$c.fa . -nSize=5000000 endif if (-e $c/chr${c}_random.fa) then cp -p $c/chr${c}_random.agp $c/chr${c}_random.agp.bak cp -p $c/chr${c}_random.fa $c/chr${c}_random.fa.bak splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . -nSize=5000000 mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst mv ${c}_random/lift/ordered.lft $c/lift/random.lft mv ${c}_random/lift/ordered.lst $c/lift/random.lst rmdir ${c}_random/lift rm ${c}_random/chr${c}_random.{agp,fa} rm -fr $c/chr${c}_random_1 rm -fr $c/chr${c}_random_2 mv ${c}_random/* $c rmdir ${c}_random endif end 'EOF' csh jkStuff/splitChrom.csh > jkStuff/splitChrom.log & tail -100f jkStuff/splitChrom.log # make liftall.lft ssh kksilo cd /cluster/data/panTro1 cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft # Split these pseudo-contigs into 500Kb chunks ssh kksilo cd /cluster/data/panTro1 foreach d ( */chr*_?{,?} ) cd $d echo "splitting $d" set contig = $d:t faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -maxN=500000 cd ../.. end # Create RM script and cluster job # NOTE: actual run was performed in two cluster jobs -- the second # was RMRun.chimpLib, for the chimp-specific repeats cd /cluster/data/panTro1 mkdir -p jkStuff mkdir -p RMRun.chrom cat << '_EOF_' > jkStuff/RMChimp.chrom #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/panTro1/$2 /bin/cp $2 /tmp/panTro1/$2 cd /tmp/panTro1/$2 # mask with chimp lib /cluster/bluearc/RepeatMasker030619/RepeatMasker -s -ali -nolow -lib /cluster/bluearc/RepeatMasker030619/Libraries/chimp.lib $2 popd /bin/cp /tmp/panTro1/$2/$2.out ./$2.chimp.out if (-e /tmp/panTro1/$2/$2.align) /bin/cp /tmp/panTro1/$2/$2.align ./$2.chimp.align /bin/rm -f /tmp/panTro1/$2/* cd $1 pushd . /bin/cp $2 /tmp/panTro1/$2 cd /tmp/panTro1/$2 # mask with default primate lib /cluster/bluearc/RepeatMasker030619/RepeatMasker -s -ali $2 popd /bin/cp /tmp/panTro1/$2/$2.out ./$2.out if (-e /tmp/panTro1/$2/$2.align) /bin/cp /tmp/panTro1/$2/$2.align ./$2.align /bin/rm -fr /tmp/panTro1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/panTro1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/panTro1 '_EOF_' chmod +x jkStuff/RMChimp.chrom mkdir -p RMRun.chrom rm -f RMRun.chrom/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}{,?}.fa ) set f = $f:t echo /cluster/data/panTro1/jkStuff/RMChimp.chrom \ /cluster/data/panTro1/$d $f \ '{'check out line+ /cluster/data/panTro1/$d/$f.out'}' \ >> RMRun.chrom/RMJobs end end #- Do the run ssh kk cd /cluster/data/panTro1/RMRun.chrom para create RMJobs para try, para check, para check, para push, para check,... # additional run for chrUn in RMRun.chrUn (2004-01-24 kate) # additional run for chr1_random in RMRun.chr1_random (2004-01-26 kate) # 383 jobs written to batch #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/panTro1 cat > jkStuff/liftPseudoContigOuts.csh << 'EOF' foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t echo $contig liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out ${contig}_*.fa.chimp.out > /dev/null cd ../.. end 'EOF' csh jkStuff/liftPseudoContigOuts.csh >&! jkStuff/liftPseudoContigOuts.log & tail -100f jkStuff/liftPseudoContigOuts.log # Lift pseudo-contigs to chromosome level cat > jkStuff/liftToChromOut.csh << 'EOF' foreach i (?{,?}) echo lifting $i cd $i if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$i.fa.out lift/ordered.lft warn `cat lift/oOut.lst` > /dev/null else echo "Can not find $i/lift/ordered.lft ." endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${i}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` > /dev/null endif cd .. end 'EOF' csh jkStuff/liftToChromOut.csh >&! jkStuff/liftToChromOut.log & tail -100f jkStuff/liftToChromOut.log #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/panTro1 # ./jkStuff/dropSplitTable.csh rmsk hgLoadOut panTro1 ?/*.fa.out ??/*.fa.out # warnings: "Strange perc. field" in several chroms -- occurs w/ negative percent ins. # 1_random, 2, 3, X, featureBits panTro1 rmsk # 1311281819 bases of 3109246583 (42.174%) in intersection featureBits panTro1 rmsk # SIMPLE REPEATS TRACK (DONE 2004-01-21 kate) sh kksilo mkdir -p /cluster/data/panTro1/bed/simpleRepeat cd /cluster/data/panTro1/bed/simpleRepeat mkdir -p trf rm -f jobs.csh echo '#\!/bin/csh -fe' > jobs.csh # create job list of 5MB chunks foreach f \ (/cluster/data/panTro1/?{,?}/chr?{,?}_[0-9]*/chr?{,?}_?{,?}{,?}.fa \ /cluster/data/panTro1/?{,?}/chr*_random_?{,?}/chr*_random_?{,?}{,?}.fa) set fout = $f:t:r.bed echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end chmod +x jobs.csh wc jobs.csh # 773 4634 116496 jobs.csh ./jobs.csh >&! jobs.log & # in bash: ./jobs.csh > jobs.log 2>&1 & tail -f jobs.log # Additional run for chrUn (2004-01-24 kate) # Additional run for chr1_random (2004-01-26 kate) # 27 jobs.chr1_random.csh # When job is done lift output files liftUp simpleRepeat.bed /cluster/data/panTro1/jkStuff/liftAll.lft warn trf/*.bed # Load into the database ssh hgwdev cd /cluster/data/panTro1/bed/simpleRepeat hgLoadBed panTro1 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # Loaded 531061 elements of size 16 featureBits panTro1 simpleRepeat # 53732632 bases of 3109246583 (1.728%) in intersection featureBits panTro1 simpleRepeat # 54320136 bases of 2865248791 (1.896%) in intersectio # MASK 5M PSEUDO-CONTIGS (DONE 2004-01-26 kate) ssh kksilo cd /cluster/data/panTro1 cat > jkStuff/maskContigs.csh << 'EOF' foreach i (?{,?}) cd $i echo masking $i pseudo-contigs foreach j (chr*_*/chr${i}{,_random}_?{,?}.fa) set c = $j:t:r echo masking $j /cluster/bin/i386/maskOutFa $j $j.out $j.rmsk -soft /cluster/bin/i386/maskOutFa $j ../bed/simpleRepeat/trfMask/$c.bed $j.msk -softAdd mv $j $j.nomask mv $j.msk $j /cluster/bin/i386/maskOutFa $j hard $j.hmsk end cd .. end 'EOF' csh jkStuff/maskContigs.csh > jkStuff/maskContigs.log & tail -100f jkStuff/maskContigs.log cat > jkStuff/hardMaskChrom.csh << 'EOF' foreach i (?{,?}) cd $i if (-e chr${i}.fa) then echo hard-masking chr$i.fa /cluster/bin/i386/maskOutFa chr$i.fa hard chr$i.fa.hmsk endif if (-e chr${i}_random.fa) then echo hard-masking chr$i.fa /cluster/bin/i386/maskOutFa chr${i}_random.fa hard \ chr${i}_random.fa.hmsk endif cd .. end 'EOF' csh jkStuff/hardMaskChrom.csh >&! jkStuff/hardMaskChrom.log & tail -100f jkStuff/hardMaskChrom.log # copy to bluearc for cluster runs ssh kksilo cd /cluster/data/panTro1 mkdir -p /cluster/bluearc/panTro1/trfFa foreach d (*/chr*_?{,?}) cp $d/$d:t.fa /cluster/bluearc/panTro1/trfFa end # DOWNLOADS (DONE 2004-02-12 kate) ssh hgwdev cd /cluster/data/panTro1 # preliminary downloads of chromosomes for Evan Eichler's lab # to verify cat > jkStuff/zipChroms.csh << 'EOF' cd /cluster/data/panTro1 set dir = /usr/local/apache/htdocs/goldenPath/panTro1/chromosomes mkdir -p $dir foreach f (?{,?}/*.fa) echo "zipping $f" set b = $f:t:r nice zip -j $dir/${b}.fa.zip $f end 'EOF' csh jkStuff/zipChroms.csh >&! jkStuff/zipChroms.log & tail -100f jkStuff/zipChroms.log cd $dir md5sum *.zip > md5sum.txt cp ../../panTro1/chromosomes/README.txt . # edit # MORE HERE - 2/3/04 angie ssh kksilo cd /cluster/data/panTro1 mkdir zip cat << '_EOF_' > jkStuff/zipOut.csh rm -rf zip mkdir zip zip -j zip/chromOut.zip */chr*.fa.out '_EOF_' # << this line makes emacs coloring happy csh ./jkStuff/zipOut.csh |& tee zip/zipOut.log cd zip #- Look at zipAll.log to make sure all file lists look reasonable. #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test #- Copy the .zip files to hgwdev:/usr/local/apache/... ssh hgwdev cd /cluster/data/panTro1/zip set gp = /usr/local/apache/htdocs/goldenPath/panTro1 # create README.txt mkdir -p $gp/bigZips cp -p *.zip $gp/bigZips cd $gp/bigZips md5sum *.zip > md5sum.txt # remake chrom quality files (bug fixed in chimpChromQuals that # left out trailing zeroes for final gap) # NOTE: not reloading quality track at this point # (will do this later, if needed) # 2004-07-06 kate ssh kksilo cd /cluster/data/panTro1 cat > jkStuff/chromQualsRedo.csh << 'EOF' foreach c (?{,?}) echo "chr$c quality scores" if (-e $c/chr$c.agp) then chimpChromQuals $c/chr$c.agp scaffolds.qac $c/chr$c.qac endif if (-e $c/chr${c}_random.agp) then chimpChromQuals $c/chr${c}_random.agp \ scaffolds.qac $c/chr${c}_random.qac endif end 'EOF' # csh jkStuff/chromQualsRedo.csh >& jkStuff/chromQualsRedo.log & tail -100f jkStuff/chromQualsRedo.log # quality scores ssh kksilo cd /cluster/data/panTro1 cat << '_EOF_' > jkStuff/zipQuals.csh foreach f (?{,?}/*.qac) echo $f set b = $f:r echo $b qacToQa $f $b.qa end rm -rf zip mkdir zip echo "zipping zip/chromQuals.zip" zip -j zip/chromQuals.zip ?{,?}/chr*.qa '_EOF_' # << this line makes emacs coloring happy csh jkStuff/zipQuals.csh >&! zip/zipQuals.log & tail -100f zip/zipQuals.log #- Look at zipQuals.log to make sure all file lists look reasonable. # cleanup uncompressed quality files rm (?{,?}/chr*.qac) #- Copy the .zip files to hgwdev:/usr/local/apache/... ssh hgwdev cd /cluster/data/panTro1/zip set gp = /usr/local/apache/htdocs/goldenPath/panTro1 mkdir -p $gp/bigZips cp -p *.zip $gp/bigZips cd $gp/bigZips md5sum *.zip > md5sum.txt # other downloadables ssh kksilo cd /cluster/data/panTro1 cat << '_EOF_' > jkStuff/zipMost.csh rm -rf zip mkdir zip zip -j zip/chromAgp.zip ?{,?}/chr*.agp zip -j zip/chromFa.zip ?{,?}/chr?{,?}{,_random}.fa zip -j zip/chromFaMasked.zip ?{,?}/chr*.fa.hmsk zip -j zip/scaffolds.lft.zip jkStuff/scaffolds.lft '_EOF_' # << this line makes emacs coloring happy csh jkStuff/zipMost.csh >&! zip/zipMost.log & tail -100f zip/zipMost.log #- Look at zipQuals.log to make sure all file lists look reasonable. #- Copy the .zip files to hgwdev:/usr/local/apache/... ssh hgwdev cd /cluster/data/panTro1/zip set gp = /usr/local/apache/htdocs/goldenPath/panTro1 mkdir -p $gp/bigZips cp -p *.zip $gp/bigZips cd $gp/bigZips md5sum *.zip > md5sum.txt # AUTO UPDATE GENBANK MRNA RUN (IN PROGRESS - 2004-01-23 - kate) # distribute masked nibs to cluster disks ssh kk mkdir -p /scratch/chimp/panTro1/nib cp /cluster/data/panTro1/nib/chr*.nib /scratch/chimp/panTro1/nib ssh eieio cd /cluster/store5/genbank # This is a new organism, edit the etc/genbank.conf file and add: # chimp panTro1.genome = /scratch/chimp/panTro1/nib/chr*.nib panTro1.lift = /cluster/store6/panTro1/jkStuff/liftAll.lft panTro1.downloadDir = panTro1 panTro1.genbank.est.xeno.load = yes pantro1.refseq.mrna.xeno.load = yes # NOTE: Mark D must add the new species to the auto-update code ssh eieio cd /cluster/bluearc/genbank/work nice bin/gbAlignStep -iserver=no \ -srcDb=genbank -type=mrna -verbose=1 -initial panTro1 # Completed: 2945 of 2945 jobs # CPU time in finished jobs: 21679s 361.32m 6.02h 0.25d 0.001 y # IO & Wait Time: 14040s 234.00m 3.90h 0.16d 0.000 y # Average job time: 12s 0.20m 0.00h 0.00d # Longest job: 65s 1.08m 0.02h 0.00d # Submission to last job: 73s 1.22m 0.02h 0.00d # Load the results from the above ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad panTro1 # To get this next one started, the above results need to be # moved out of the way. These things can be removed if there are # no problems to debug ssh eieio cd /cluster/bluearc/genbank/work mv initial.panTro1 initial.panTro1.genbank.mrna cd /cluster/data/genbank nice bin/gbAlignStep -iserver=no \ -srcDb=refseq -type=mrna -verbose=1 -initial panTro1 ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 panTro1 # the big EST alignment job ssh eieio cd /cluster/bluearc/genbank/work mv initial.panTro1 initial.panTro1.refSeq.mrna cd /cluster/data/genbank nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial panTro1 # recovery ssh kk cd /cluster/bluearc/genbank/work/initial.panTro1/align para push ssh eieio cd /cluster/data/genbank nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 \ -initial -continue=finish -iserver=no panTro1 ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad panTro1 # GENSCAN PREDICTIONS (IN PROGRESS - 2004-02-11 kate) ssh hgwdev mkdir -p /cluster/data/panTro1/bed/genscan cd /cluster/data/panTro1/bed/genscan cvs co hg3rdParty/genscanlinux ssh kksilo cd /cluster/data/panTro1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir -p gtf pep subopt # Generate a list file, genome.list, of all the pseudo-contigs ls -1S /cluster/data/panTro1/?{,?}/chr*_*/chr*_*.fa.hmsk > genome.list # don't use big cluster, due to limitation of memory and swap space on each # processing node). ssh kk9 cd /cluster/data/panTro1/bed/genscan # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/home/kate/bin/RH7/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList # 568 jobs written to batch para try para check para push # DONE TO HERE # NOTE: 2 jobs (chr6_7, chr19_10) failed with: # "No overlap between a and b in mergeTwo" -- investigating, # but continuing with the track # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/panTro1/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/panTro1/bed/genscan ldHgGene panTro1 genscan genscan.gtf # Read 41471 transcripts in 303679 lines in 1 files hgPepPred panTro1 generic genscanPep genscan.pep hgLoadBed panTro1 genscanSubopt genscanSubopt.bed # Loaded 535541 elements of size 6 # CPGISLANDS (DONE - 2004-01-26 - kate) ssh kksilo mkdir -p /cluster/data/panTro1/bed/cpgIsland cd /cluster/data/panTro1/bed/cpgIsland # Copy program as built for previous hg build: mkdir cpg_dist cp -p ~/hg15/bed/cpgIsland/cpg_dist/cpglh.exe ./cpg_dist # This step used to read, but I do not immediately see the .tar # file anywhere: (there is a copy in ~/rn3/bed/cpgIsland) # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu) # copy the tar file to the current directory # tar xvf cpg_dist.tar # cd cpg_dist # gcc readseq.c cpg_lh.c -o cpglh.exe # cd .. # cpglh.exe requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. cat > ../../jkStuff/cpg.csh << 'EOF' foreach f (../../?{,?}/chr?{,?}{,_random}.fa.hmsk) set fout=$f:t:r:r.cpg echo producing $fout... ./cpg_dist/cpglh.exe $f > $fout end 'EOF' csh ../../jkStuff/cpg.csh >&! ../../jkStuff/cpg.log & tail -100f ../../jkStuff/cpg.log # takes ~40 minutes cat << '_EOF_' > filter.awk /* chr1\t1325\t3865\t754\tCpG: 183\t64.9\t0.7 */ /* Transforms to: (tab separated columns above, spaces below) */ /* chr1 1325 3865 CpG: 183 754 183 489 64.9 0.7 */ { width = $3-$2; printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\n", $1,$2,$3,$5,$6,width,$6,width*$7*0.01,100.0*2*$6/($3-$2),$7);} '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg > cpgIsland.bed ssh hgwdev cd /cluster/data/panTro1/bed/cpgIsland hgLoadBed panTro1 cpgIsland -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed # stringTab = 1 # Reading cpgIsland.bed # Loaded 27596 elements # Sorted # Saving bed.tab # Loading panTro1 # GC 5 BASE WIGGLE TRACK (DONE - 2004-01-28 - Hiram) # working on kksilo, the fileserver for this data. ssh kksilo mkdir /cluster/data/panTro1/bed/gc5Base cd /cluster/data/panTro1/bed/gc5Base mkdir wigData5 dataLimits5 for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 panTro1 ../../nib | grep -w GC | \ awk ' { bases = $3 - $2 perCent = $5/10.0 printf "%d\t%.1f\n", $2+1, perCent }' | wigAsciiToBinary -dataSpan=5 -chrom=${c} \ -wibFile=wigData5/gc5Base_${C} -name=${C} stdin > dataLimits5/${c} echo "done ${c}" done # data is compete, load track on hgwdev ssh hgwdev cd /cluster/data/panTro1/bed/gc5Base hgLoadWiggle panTro1 gc5Base wigData5/*.wig # create symlinks for .wib files mkdir /gbdb/panTro1/wib ln -s `pwd`/wigData5/*.wib /gbdb/panTro1/wib # the trackDb entry track gc5Base shortLabel GC Percent longLabel GC Percent in 5 base windows group map priority 23.5 visibility hide autoScale Off maxHeightPixels 128:36:16 graphTypeDefault Bar gridDefault OFF windowingFunction Mean color 0,128,255 altColor 255,128,0 viewLimits 30:70 type wig 0 100 # GENERATE OOC FILE FOR CHIMP MRNA ALIGNMENTS (DONE 2004-01-30 kate) # Note: have MarkD add ooc file to scripts, and redo chimp mrna alignments ssh kksilo cd /cluster/data/panTro1 ls ?{,?}/*.fa > jkStuff/chrom.lst blat jkStuff/chrom.lst jkStuff/ooc.txt jkStuff/ooc.psl \ -tileSize=11 -makeOoc=11.ooc -repMatch=1024 cp 11.ooc /cluster/bluearc/hg/h/chimp11.ooc /cluster/bluearc/panTro1/11.ooc # redo chimp mrna's ssh eieio cd /cluster/bluearc/genbank/work mv initial.panTro1 initial.panTro1.genbank.est cd /cluster/data/genbank nice bin/gbAlignStep -iserver=no \ -srcDb=genbank -type=mrna -verbose=1 -initial panTro1 # Load the results ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad panTro1 # restart EST's (process dropped out for some reason) ssh eieio cd /cluster/bluearc/genbank/work mv initial.panTro1 initial.panTro1.genbank.mrna2 mv initial.panTro1.genbank.est initial.panTro1 cd /cluster/data/genbank nice bin/gbAlignStep -iserver=no -continue=finish \ -srcDb=genbank -type=est -verbose=1 -initial panTro1 # CREATING QUALITY SCORE TRACK # Create compressed scaffold quality file ssh kksilo cd /cluster/data/panTro1 zcat contigs.quals.gz | qaToQac stdin stdout | \ chimpChromQuals assembly.agp stdin scaffolds.qac # qaToQac contigs.quals stdout | chimpChromQuals assembly.agp stdin scaffolds.qac mkdir -p bed/qual/{wigData,dataLimits} cat > jkStuff/chromQuals.csh << 'EOF' foreach c (?{,?}) echo "chr$c quality scores" if (-e $c/chr$c.agp) then chimpChromQuals $c/chr$c.agp scaffolds.qac $c/chr$c.qac qacToWig $c/chr$c.qac -name=chr$c stdout | \ wigAsciiToBinary -dataSpan=1 -chrom=chr$c \ -wibFile=bed/qual/wigData/qual_$c -name=$c stdin > \ bed/qual/dataLimits/chr$c endif if (-e $c/chr${c}_random.agp) then chimpChromQuals $c/chr${c}_random.agp \ scaffolds.qac $c/chr${c}_random.qac qacToWig $c/chr${c}_random.qac -name=chr${c}_random stdout | \ wigAsciiToBinary -dataSpan=1 -chrom=chr${c}_random \ -wibFile=bed/qual/wigData/qual_${c}r -name=${c}r stdin > \ bed/qual/dataLimits/chr${c}_random endif end 'EOF' csh jkStuff/chromQuals.csh >& jkStuff/chromQuals.log & tail -100f jkStuff/chromQuals.log # takes 3-4 hours # Note: can verify size of chrom qual files by comparing wc to # chrom length -- subtracting out length of trailing gap when doing this # as chimpChromQuals doesn't add quality scores for this # load wiggle into database ssh hgwdev cd /cluster/data/panTro1/bed/qual hgLoadWiggle panTro1 quality wigData/*.wig # create symlinks for .wib files mkdir /gbdb/panTro1/wib ln -s `pwd`/wigData/*.wib /gbdb/panTro1/wib # the trackDb entry track quality shortLabel Quality Scores longLabel Quality Scores group map priority 23.6 visibility hide autoScale Off maxHeightPixels 128:36:16 graphTypeDefault Bar gridDefault OFF color 0,128,255 altColor 255,128,0 type wig 0 50 # This wiggle track needs some zoom tables to aid display on whole # chromosome views ssh kksilo cd /cluster/data/panTro1 mkdir -p bed/qual/wigData1K mkdir -p bed/qual/dataLimits1K cat << '_EOF_' > jkStuff/qualZoom1K.sh #!/bin/sh for c in ? ?? do if [ -f $c/chr${c}.qac ]; then echo "chr${c} quality 1K zoom" qacToWig $c/chr${c}.qac -name=chr${c} stdout | wigZoom stdin | wigAsciiToBinary -dataSpan=1024 \ -chrom=chr${c} -wibFile=bed/qual/wigData1K/qc1K_${c} \ -name=${c} stdin > bed/qual/dataLimits1K/chr${c} fi if [ -f $c/chr${c}_random.qac ]; then echo "chr${c}_random quality 1K zoom" qacToWig $c/chr${c}_random.qac -name=chr${c}_random stdout | wigZoom stdin | wigAsciiToBinary -dataSpan=1024 \ -chrom=chr${c}_random -wibFile=bed/qual/wigData1K/qc1K_${c}r \ -name=${c}_random stdin > bed/qual/dataLimits1K/chr${c}r fi done '_EOF_' chmod +x ./jkStuff/qualZoom1K.sh time ./jkStuff/qualZoom1K.sh # real 227m8.064s # user 165m8.720s # sys 3m58.780s ssh hgwdev cd bed/qual/wigData1K # Do not need these for chrom M rm -f qc1K_M.wig qc1K_M.wib qc1K_Mr.wig qc1K_Mr.wib hgLoadWiggle -oldTable panTro1 quality *.wig # create symlinks for .wib files ln -s `pwd`/*.wib /gbdb/panTro1/wib # HUMAN ALIGNMENTS (DONE 2004-02-11 kate) ssh kksilo cd /cluster/data/panTro1 # start with full chimp-reference chains from # December Human/Chimp alignments (hg16/pt0, blastz+blat) mkdir -p bed/blastz-blatHg16.pt0.swap cd bed/blastz-blatHg16.pt0.swap cp /cluster/data/pt0/bed/blastz-blatHg16/all.scaffold.chain . # lift to chimp chromosome coordinates # (need latest liftUp) ~kate/bin/i386/liftUp all.chain ../../jkStuff/scaffolds.lft \ warn all.scaffold.chain # rechain to catch alignments spanning scaffold boundaries ls -1S /cluster/data/panTro1/nib/*.nib > chimpNib.lst ls -1S /cluster/data/hg16/nib/*.nib > humanNib.lst chainToPsl all.chain ../../chrom.sizes /cluster/data/hg16/chrom.sizes \ chimpNib.lst humanNib.lst all.chain.psl # in retrospect, I would have done a chainSplit first... pslSortAcc nohead pslChain temp all.chain.psl # Processed 9254050 lines into 38 temp files mkdir -p pslChain/run cd pslChain/run mkdir out chain ssh kk cd /cluster/data/panTro1 cd blastz-blatHg16.pt0.swap/pslChain/run # copy nibs to iservers (seem to have disappeared from /scratch/chimp) mkdir -p /iscratch/i/chimp/panTro1/nib cp /cluster/bluearc/scratch/chimp/panTro1/nib/* \ /iscratch/i/chimp/panTro1/nib iSync ls -1S /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap/pslChain/*.psl \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -psl $1 \ /scratch/chimp/panTro1/nib \ /iscratch/i/gs.17/build34/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList # 52 jobs para try para push # merge chains to give unique chain ID's in preparation for netting ssh kksilo cd /cluster/data/panTro1 cd bed/blastz-blatHg16.pt0.swap chainMergeSort pslChain/run/chain/*.chain | chainSplit netChain stdin # NOTE: next time, save the merged chain file for alter use # (e.g. netChainSubset will need it) # load chains into browser ssh hgwdev cd /cluster/data/panTro1 cd bed/blastz-blatHg16.pt0.swap/netChain foreach i (*.chain) set c = $i:r hgLoadChain panTro1 ${c}_chainHg16 $i echo done $c end # net the chains ssh kksilo cd /cluster/data/panTro1 cd bed/blastz-blatHg16.pt0.swap/netChain # needs Angie's mod to chainNet to suppress info messages cat > net.csh << 'EOF' chainMergeSort -saveId chr*.chain | \ chainPreNet stdin \ /cluster/data/panTro1/chrom.sizes \ /cluster/data/hg16/chrom.sizes stdout | \ ~kate/bin/i386/chainNet stdin -minSpace=1 \ /cluster/data/panTro1/chrom.sizes \ /cluster/data/hg16/chrom.sizes stdout /dev/null | \ netSyntenic stdin hNoClass.net 'EOF' # << this line makes emacs coloring happy csh net.csh >&! net.log & tail -100f net.log # create all.chain file with new ID's, for swapping to human browser chainMergeSort -saveId chr*.chain > ../all.newId.chain ssh hgwdev cd /cluster/data/panTro1 cd bed/blastz-blatHg16.pt0.swap/netChain netClass hNoClass.net panTro1 hg16 ../human.net # load net into the browser ssh hgwdev cd /cluster/data/panTro1 cd bed/blastz-blatHg16.pt0.swap hgLoadNet -warn panTro1 netHg16 human.net rm -r netChain/hNoClass.net # 8/23/04 move aside pre-re-chaining files to avoid confusion (angie) mkdir preChain mv all.scaffold.chain all.chain humanNib.lst chimpNib.lst all.chain.psl \ preChain/ # CHAIN FILES FROM THE NET (DONE kate) # for downloads area ssh kksilo cd /cluster/data/panTro1 cd bed/blastz-blatHg16.pt0.swap chainMergeSort -saveId *.chain | \ netChainSubset human.net stdin netChain/human.net.chain chainSplit netChainSubset netChain/human.net.chain ssh hgwdev cd /cluster/data/panTro1 cd bed/blastz-blatHg16.pt0.swap/netChainSubset set dir = /usr/local/apache/htdocs/goldenPath/panTro1/vsHg16/netChain # add README to vsHg16 dir. mkdir -p $dir cp *.chain $dir cd $dir gzip *.chain md5sum *.gz > md5sum.txt cp ~/kent/src/hg/mouseStuff/chainFormat.doc . # AXT FILES FROM THE NET (DONE kate) # for a "Human Best" track that displays alignments at # base level, and for download ssh kksilo cd /cluster/data/panTro1 cd bed/blastz-blatHg16.pt0.swap mkdir net axtNet netSplit human.net net cd axtNet cat > makeAxt.csh << 'EOF' cd ../net foreach f (*.net) set i = $f:r echo $i.net netToAxt $i.net ../netChain/$i.chain \ /cluster/data/panTro1/nib /cluster/data/hg16/nib \ ../axtNet/$i.axt end 'EOF' # << this line makes emacs coloring happy csh -f makeAxt.csh >&! makeAxt.log & tail -100f makeAxt.log # save in /gbdb and load into database ssh hgwdev mkdir -p /gbdb/panTro1/axtNetHg16 cd /gbdb/panTro1/axtNetHg16 foreach f (/cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap/axtNet/*.axt) ln -s $f /gbdb/panTro1/axtNetHg16 end cd /cluster/data/panTro1 cd bed/blastz-blatHg16.pt0.swap/axtNet hgLoadAxt panTro1 axtNetHg16 # copy to download area ssh kksilo cd /cluster/data/panTro1 cd bed/blastz-blatHg16.pt0.swap/axtNet ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/panTro1/vsHg16/axtNet cd /usr/local/apache/htdocs/goldenPath/panTro1/vsHg16/axtNet cp /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap/axtNet/*.axt . gzip *.axt md5sum *.gz > md5sum.txt # create README.txt # add to axtInfo table for Comparative Sequence link on gene pred tracks # (2004-04-27 kate) ssh hgwdev hgsql panTro1 < ~/kent/src/hg/lib/axtInfo.sql foreach f (/gbdb/panTro1/axtNetHg16/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('hg16','Reciprocal Best','$chr','$f');" | hgsql panTro1 end # Genbank ESTs completed, but needed to redo native ESTs/mRNAs with # new ooc file. # Remove native ESTs: cd /cluster/data/genbank find aligned/genbank.139.0/panTro1 -name 'est.*.native.*' |xargs rm -f # realign native ESTs nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial panTro1& # Reload mRNAs and load ESTS ssh hgwdev nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad panTro1 # HUMAN REFSEQ MAPPINGS (Writeup incomplete, in progress jk) liftOver ../../../bedOverHg16/refGene2/hg16.refGene.gtf ../../humanTarget.chain new.gtf unmapped.gtf -minMatch=0.5 -gff ldHgGene panTro1 mappedRefSeq new.gtf # HUMAN DELETIONS (kpollard, finished 2/19/04) # 80-12000 bp indels in human/chimp alignments (from Tarjei's files) #data mkdir -p /cluster/data/panTro1/bed/indels cd /cluster/data/panTro1/bed/indels #ftp the file indels.human.chimp.tar.gz from ftp.broad.mit.edu gunzip indels.human.chimp.tar.gz tar -xvf indels.human.chimp.tar mkdir arachne mv indels_arachne.* arachne gzip arachne/indels_arachne.chimp.fa gzip arachne/indels_arachne.human.fa #make .bed files from Tarjei's .fa files cluster/bin/i386/faSize detailed=on indels.chimp.fa > chimp.all.txt /cluster/bin/i386/faSimplify -prefix="scaffold_" indels.chimp.fa _ , temp.fa /cluster/bin/i386/faSize detailed=on temp.fa > chimp.scaff.txt cat /cluster/data/panTro1/jkStuff/scaffolds.lft | gawk '{print $2"\t"$6}' > scaffStrand.txt R #commands in R scaff<-read.table("chimp.scaff.txt") #read in scaffold and size scaff.info<-read.table("scaffStrand.txt") #read in scaffold strand info all<-read.table("chimp.all.txt") #read in full .fa header and size long<-as.character(all[,1]) #extract full .fa header longsplit<-matrix(unlist(strsplit(long,",")),nc=4,byrow=TRUE) #split end<-cbind(as.numeric(longsplit[,4]),as.numeric(all[,2])) #end and size both<-cbind(scaff,end) #concatenate scaff size end size sum(both[,2]!=both[,4]) #check sizes agree dimnames(both)<-list(1:18395,c("scaff","size","start","stop")) #name columns of both dimnames(scaff.info)<-list(1:37931,c("scaff","strand")) #name rows and columns of scaff.info both.strand<-merge(both,scaff.info,by="scaff",sort=FALSE) #merge scaff.info and both based on the column scaff attach(both.strand) #attach dataframe out<-both.strand #create data set out out[strand=="+","stop"]<-start[strand=="+"]+size[strand=="+"] #stop for + strands out[strand=="-","stop"]<-start[strand=="-"]-size[strand=="-"] #stop for - strands out[strand=="-",]<-out[strand=="-",c(1,2,4,3,5)] #rearrange columns for - strands so start comes before stop out<-out[,c(1,3,4,2)] #reorder columns: scaff start stop size out[,4]<-paste("HD",1:length(out[,4]),"_",both[,4],sep="") #make name like HDN_size write(t(out),"indels.chimp.new.bed",ncol=4) q() #back in shell: map to chromosomes cat indels.chimp.new.bed | gawk '{print $1"\t"$2"\t"$3"\t"$4}' > indels.chimp.tab.bed /cluster/bin/i386/liftUp -type=.bed indels.chimp.chr.bed /cluster/data/panTro1/jkStuff/scaffolds.lft warn indels.chimp.tab.bed #load track into chimp browser mkdir -p /gbdb/panTro1/humanDels ln -s /cluster/data/panTro1/bed/indels/indels.chimp.chr.bed /gbdb/panTro1/humanDels cd /cluster/data/panTro1/bed/indels /cluster/bin/i386/hgLoadBed panTro1 humanDels indels.chimp.chr.bed #add description file humanDels.html # to ~/kent/src/hg/makeDb/trackDb/chimp/panTro1 #add a track entry to trackDb.ra # in ~/kent/src/hg/makeDb/trackDb/chimp/panTro1 # HUMAN REFSEQ ALIGNMENTS (markd) Added hack in genbank alignment code so that the xeno refseq track consists only of human refseqs. This had to be reloaded due to query type getting lost. # TWINSCAN (genes: baertsch 2004-03-11, proteins: kate 2004-04-22) # load twinscan predictions without filtering pseudogenes baertsch 3/11/04 # loaded frame info cd /cluster/data/panTro1/bed mkdir -p twinscan cd twinscan wget http://genes.cs.wustl.edu/predictions/chimp/panTro1_vs_mm4/chimp_panTro1_vs_mm4.tgz tar xvfz *.tgz cd chr_gtf # load gene predictions ldHgGene panTro1 twinscan chr*.gtf -gtf -frame -geneName # load predicted proteins # pare down protein FASTA header to id and add missing .a: foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 X Y) echo chr$c perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa end hgPepPred panTro1 generic twinscanPep chr_ptx/chr*-fixed.fa # GENEID GENES (2004-03-16 kate) ssh hgwdev mkdir -p /cluster/data/panTro1/bed/geneid/download cd /cluster/data/panTro1/bed/geneid/download # Now download *.gtf and *.prot from set dir = genome.imim.es/genepredictions/P.troglodytes/golden_path_200311/geneid_v1.1/ foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 X Y Un) wget http://$dir/chr$c.gtf wget http://$dir/chr${c}_random.gtf wget http://$dir/chr$c.prot wget http://$dir/chr${c}_random.prot end wget http://$dir/readme # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot echo "done $f" end cd .. #ldHgGene panTro1 geneid download/*.gtf -exon=CDS #Read 34198 transcripts in 279022 lines in 51 files # Compare to Hg16 was: Read 32255 transcripts in 281180 lines # in 40 files #34198 groups 51 seqs 1 sources 3 feature types #34198 gene predictions ldHgGene panTro1 geneid download/*.gtf -gtf hgPepPred panTro1 generic geneidPep download/*-fixed.prot # reload chr1 (bad original data) (2004-04-06 kate) set dir = genome.imim.es/genepredictions/P.troglodytes/golden_path_200311/geneid_v1.1/ set c = 1 wget http://$dir/chr$c.gtf wget http://$dir/chr$c.prot set f = chr1.prot perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot cd .. ldHgGene panTro1 geneid download/*.gtf -gtf # 32432 gene predictions hgPepPred panTro1 generic geneidPep download/*-fixed.prot # REVISED HUMAN DELETIONS (kpollard, 4/13/04) # new file emailed from Tarjei: humanDels.new.dat # with corrected coordinates (based on chimp agp) # saved in /cluster/data/panTro1/bed/indels cd /cluster/data/panTro1/bed/indels #has chr start end, need to add names pr -t -n humanDels.new.dat > temp cat temp | gawk '{size=$4 - $3; print $2"\t"$3"\t"$4"\t""HD"$1"_"size}' > humanDels.new.bed #load track into chimp browser ln -s /cluster/data/panTro1/bed/indels/humanDels.new.bed /gbdb/panTro1/humanDels hgsql panTro1 delete from humanDels; exit cd /cluster/data/panTro1/bed/indels /cluster/bin/i386/hgLoadBed panTro1 humanDels humanDels.new.bed # BAC Ends (2004-04-22 kate) # download BAC ends from NCBI. The chimp set appear to # be from Riken, Japan (source "djb", databank japan). # Ref: Fujiyama, et. al, Science Jan 4, 2002 ssh eieio mkdir -p /cluster/data/ncbi/bacends/chimp cd /cluster/data/ncbi/bacends/chimp mkdir bacends.chimp.1 cd bacends.chimp.1 wget ftp.ncbi.nih.gov/genomes/BACENDS/pan_troglodytes/AllBACends.mfa.gz wget ftp.ncbi.nih.gov/genomes/BACENDS/pan_troglodytes/cl_acc_gi_len.gz # problems with wget -- downloaded from browser gunzip *.gz # create pairs info file /cluster/bin/scripts/convertBacEndPairInfo cl_acc_gi_len # 78846 pairs and 896 singles # create sequence file # edit convert.pl to allow "dbj" source instead of "gb" # (data bank japan vs. genbank) cp /cluster/store1/bacends.4/convert.pl . convert.pl < AllBACends.mfa > BACends.fa faCount BACends.fa ssh hgwdev mkdir -p /gbdb/panTro1/bacends cd /gbdb/panTro1/bacends ln -s /cluster/data/ncbi/bacends/chimp/bacends.chimp.1/BACends.fa BACends.chimp.fa # 123470915 bases, 158588 sequences # split to cluster dir -- break into 10 files for a totol # of 10 * 30K (#scaffolds) = 300K jobs ssh eieio cd /cluster/data/ncbi/bacends/chimp/bacends.chimp.1 mkdir -p /iscratch/i/chimp/bacends.1 # split sequence file and copy for cluster access faSplit sequence BACends.fa 10 /iscratch/i/chimp/bacends.1/ ssh kkr1u00 /cluster/bin/iSync # split scaffolds file for cluster access cd /cluster/bluearc/panTro1 mkdir -p scaffolds faSplit byName scaffolds.msk.fa scaffolds/ # blat vs. unmasked scaffolds # use /cluster/bluearc/booch/bacends/Makefile # edit bacends.lst ssh kk cd /cluster/data/panTro1/bed/bacends mkdir run cd run ls -1S /cluster/bluearc/panTro1/scaffolds | sed 's.^./cluster/bluearc/panTro1/scaffolds/.' > scaffolds.lst ls /iscratch/i/chimp/bacends.1/*.fa > bacends.lst mkdir ../out cat > gsub << 'EOF' #LOOP /cluster/bin/i386/blat $(path1) $(path2) -ooc=/scratch/hg/h/11.ooc {check out line+ /cluster/data/panTro1/bed/bacends/out/$(root2)/$(root1).$(root2).psl} #ENDLOOP 'EOF' gensub2 scaffolds.lst bacends.lst gsub jobList foreach f (`cat bacends.lst`) set d = $f:r:t echo $d mkdir -p /cluster/data/panTro1/bed/bacends/out/$d end para create jobList # 379310 jobs para try para check para push # Note -- these run quickly! # Longest job: 595s 9.92m # lift alignments ssh kksilo cd /cluster/data/panTro1/bed/bacends pslSort dirs raw.psl temp out/?? pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ raw.psl bacEnds.psl /dev/null #Processed 16577599 alignments mkdir lifted liftUp lifted/bacEnds.lifted.psl \ /cluster/data/panTro1/jkStuff/scaffolds.lft warn bacEnds.psl pslSort dirs bacEnds.sorted.psl temp lifted rmdir temp # 520377 bacEnds.sorted.psl set ncbiDir = /cluster/data/ncbi/bacends/chimp/bacends.chimp.1 pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=600000 -slopval=10000 -hardMax=750000 -slop -short -long -orphan -mismatch -verbose bacEnds.sorted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds 26805 bacEnds.pairs # 96 w/ score=750 982 bacEnds.short 9 bacEnds.long 1179 bacEnds.slop 29071 bacEnds.orphan 190 bacEnds.mismatch # 158588 sequences # compared to hg16 203386 bacEnds.pairs # 1524 w/ score=750 # 396 w/ score=500 # 234 w/ score=250 # 96 w/ score=125 6839 bacEnds.short 57 bacEnds.long 4388 bacEnds.slop 70851 bacEnds.orphan 1404 bacEnds.mismatch # 603416 sequences # TODO: replace w/ awk & sort # create header required by "rdb" tools echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \ | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed # note: need "rdb" # note: tune pslPairs variables for draft chimp # ~150Kbp total length of clone for BAC's # 50Kbp - 600Kbp for early human # 25Kbp - 350Kbp for finished human # minimize # falling into slop, short, long # try hardMax 750K. tryout min/max variants # try to get as many as possible, uniquely # note - this track isn't pushed to RR, just used for assembly QA # create table of all bac end alignments extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \ bacEndPairsBad.bed | \ sorttbl tname tstart | headchg -del > bacEnds.load.psl # load into database ssh hgwdev cd /cluster/data/panTro1/bed/bacends hgLoadBed panTro1 bacEndPairs bacEndPairs.bed \ -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql # Loaded 26805 # note - this track isn't pushed to RR, just used for assembly QA hgLoadBed panTro1 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 31396 #hgLoadPsl panTro1 -nobin -table=all_bacends bacEnds.load.psl # NOTE: truncates file to 0 if -nobin is used hgLoadPsl panTro1 -table=all_bacends bacEnds.load.psl #load of all_bacends did not go as planned: 441072 record(s), 0 row(s) skipped, 30 warning(s) loading psl.tab # load BAC end sequences #hgLoadSeq panTro1 /gbdb/ncbi/bacends/chimp/bacends.chimp.1/BACends.chimp.fa hgLoadSeq panTro1 /gbdb/panTro1/bacends/BACends.chimp.fa # 158588 sequences # for analysis, you might want to run: make bacEndBlock.bed featureBits panTro1 bacEndPairs.bed \!gap -bed=bacEndBlocks.bed -noRandom # 1791977681 bases of 2412409802 (74.282%) in intersection # human hg16: 2827808812 bases of 2843433602 (99.450%) in intersection featureBits panTro1 bacEndPairs.bed -bed=bacEndBlocksGap.bed -noRandom # 2007151617 bases of 2412409802 (83.201%) in intersection # human hg16: 2836036212 bases of 2843433602 (99.740%) in intersection # added searchSpec for bacEndPairs to include chimp clone prefixes # CONTIGS TRACK (2004-04-23 kate) # make ctgPos (contig name, size, chrom, chromStart, chromEnd) from lift ssh kksilo cd /cluster/data/panTro1/bed mkdir ctgPos cd ctgPos awk 'BEGIN {OFS="\t"} {print $4, $1, $3+$1, $2}' ../../assembly.lft \ > contig-scaffold.bed liftUp contig-chrom.bed ../../jkStuff/scaffolds.lft warn contig-scaffold.bed awk 'BEGIN {OFS="\t"} {print $4, $3-$2, $1, $2, $3}' contig-chrom.bed \ > ctgPos.tab ssh hgwdev cd /cluster/data/panTro1/bed/ctgPos echo "drop table ctgPos" | hgsql panTro1 cat ~/kent/src/hg/lib/ctgPos.sql \ | sed 's/contig(12)/contig/' | hgsql panTro1 #hgsql panTro1 < ~/kent/src/hg/lib/ctgPos.sql echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql panTro1 # FUGU BLAT ALIGNMENTS (2004-04-27 kate) # reloaded table w/ new index 2004-05-13 kate) ssh kk mkdir /cluster/data/panTro1/bed/blatFr1 cd /cluster/data/panTro1/bed/blatFr1 ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst ls -1S /scratch/chimp/panTro1/nib/*.nib > chimp.lst cat << '_EOF_' > gsub #LOOP blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy mkdir psl gensub2 chimp.lst fugu.lst gsub spec para create spec para try, check, push, check, ... #CPU time in finished jobs: 6324220s 105403.66m 1756.73h 73.20d 0.201 y #IO & Wait Time: 2344146s 39069.11m 651.15h 27.13d 0.074 y #Average job time: 288s 4.81m 0.08h 0.00d #Longest job: 14817s 246.95m 4.12h 0.17d #Submission to last job: 51381s 856.35m 14.27h 0.59d # Sort alignments: ssh kksilo cd /cluster/data/panTro1/bed/blatFr1 pslCat -dir psl | pslSortAcc nohead chrom temp stdin # Processed 782670 lines into 4 temp files # lift query side to Fugu browser chrUn coordinates liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl # load into database: ssh hgwdev cd /cluster/data/panTro1/bed/blatFr1 hgLoadPsl -fastLoad -table=blatFr1 panTro1 all.psl # load problem with 2 rows -- qBaseInsert values were negative, # and were set to 0 when loaded. Blat problem ? # add to trackDb as type xeno psl fr1, with colorChromDefault off # Reciprocal Best chain files for download # just swap the human-reference file generated in makeHg16.doc ssh kksilo cd /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap chainSwap /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBest.chain \ stdout | chainMergeSort stdin > rBest.chain ssh hgwdev cd /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap set dir = /usr/local/apache/htdocs/goldenPath/panTro1/vsHg16 mkdir -p $dir cp -p rBest.chain $dir/chimp.best.chain cd $dir gzip *.chain # create md5sum.txt and README's # split and load into table for QA purposes ssh kksilo cd /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap chainSplit rBest rBest.chain ssh hgwdev cd /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap cd rBest foreach i (*.chain) set c = $i:r hgLoadChain panTro1 ${c}_rBestChainHg16 $i echo done $c end # LOAD ENSEMBL GENES (DONE 6/16/04 angie) ssh kksilo mkdir /cluster/data/panTro1/bed/ensembl cd /cluster/data/panTro1/bed/ensembl # Get the ensembl gene data from # http://www.ensembl.org/Pan_troglodytes/martview # Follow this sequence through the pages: # Page 1) Make sure that the Pan_troglodytes choice is selected. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Structures" box. # Page 4) Choose GTF as the ouput. choose gzip compression. hit export. # Save as ensembl.gff.gz # Add "chr" to front of each line in the gene data gtf file to make # it compatible with our software. gunzip -c ensembl.gff.gz \ | perl -wpe 's/^([0-9]+|E\w+|[XY]|Un(_random)?)/chr$1/ \ || die "Line $. doesnt start with chimp chrom:\n$_"' \ > ensGene.gtf # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. Result name ensGtp. # Save file as ensGtp.txt.gz gunzip ensGtp.txt.gz # Load Ensembl peptides: # Get them from ensembl as above in the gene section except for # Page 3) Choose the "Sequences" box. # Page 4) Transcripts/Proteins. Peptide. Format = FASTA. # Save file as ensemblPep.fa.gz gunzip -c ensemblPep.fa.gz \ | perl -wpe 's/^(>ENSPTRT\d+\.\d+).*/$1/' \ > ensPep.fa # load up: ssh hgwdev cd /cluster/data/panTro1/bed/ensembl ldHgGene -gtf -genePredExt panTro1 ensGene \ /cluster/data/panTro1/bed/ensembl/ensGene.gtf hgsql panTro1 < ~/kent/src/hg/lib/ensGtp.sql hgsql panTro1 -e 'load data local infile "ensGtp.txt" into table ensGtp' hgPepPred panTro1 generic ensPep ensPep.fa # MAP ENCODE ORTHOLOG REGIONS -- FREEZE 1 (DONE 2004-07-09 kate) ssh kksilo mkdir /cluster/data/panTro1/bed/encodeRegions cd /cluster/data/panTro1/bed/encodeRegions ssh kksilo mkdir /cluster/data/panTro1/bed/encodeRegions cd /cluster/data/panTro1/bed/encodeRegions # at .60, mapped 42, at .50 mapped all 44 liftOver -minMatch=.50 \ /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \ /cluster/data/hg16/bed/liftOver/hg16ToPanTro1.newId.over.chain \ encodeRegions.bed encodeRegions.unmapped wc -l * # 44 encodeRegions.bed # 0 encodeRegions.unmapped # 44 total ssh hgwdev cd /cluster/data/panTro1/bed/encodeRegions hgLoadBed panTro1 encodeRegionsNew encodeRegions.bed -noBin # TODO: remove this stuff # swap hg16->panTro1 chains for display in chimp browser (temporary, for ENCODE development) ssh kolossus cd /cluster/data/hg16/bed/liftOver chainSwap hg16TopanTro1.chain panTro1ToHg16.swp.chain # TODO: remove this stuff ssh hgwdev cd /cluster/data/hg16/bed/liftOver hgLoadChain panTro1 liftOverHg16SwpChain panTro1ToHg16.swp.chain # TODO: remove this stuff # also load "all chains" liftOver chain (not reciprocal best) ssh kolossus cd /cluster/data/hg16/bed/liftOver chainSwap /cluster/data/hg16/blat-blastz.panTro1/over.chain \ panTro1ToHg16.all.swp.chain ssh hgwdev cd /cluster/data/hg16/bed/liftOver hgLoadChain panTro1 liftOverHg16SwpAllChain panTro1ToHg16.all.swp.chain # TODO: remove this stuff too ssh kolossus cd /cluster/data/hg16/bed/liftOver chainSwap /cluster/data/hg16/bed/blastz-blat.panTro1/over.newId.chain \ panTro1ToHg16.newId.swp.chain ssh hgwdev cd /cluster/data/hg16/bed/liftOver hgLoadChain panTro1 liftOverHg16SwpNewIdChain panTro1ToHg16.newId.swp.chain # ACCESSIONS FOR CONTIGS (DONE 2004-08-27 kate) # Used for Scaffold track details and ENCODE AGP's ssh hgwdev cd /cluster/data/panTro1/bed mkdir -p contigAcc cd contigAcc # extract WGS contig to accession mapping from Entrez Nucleotide summaries # To get the summary file, access the Genbank page for the project # by searching: # genus[ORGN] AND WGS[KYWD] # At the end of the page, click on the list of accessions for the contigs. # Select summary format, and send to file. # output to file .acc contigAccession chimp.acc > contigAcc.tab wc -l contigAcc.tab # 361864 contigAcc.tab # extract all records from chrN_gold, using table browser # (includes header line) faSize /cluster/data/panTro1/contigs.bases # 361864 hgsql panTro1 < $HOME/kent/src/hg/lib/contigAcc.sql echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \ hgsql panTro1 hgsql panTro1 -e "SELECT COUNT(*) FROM contigAcc" # 361864 # HUMAN HG17 NET AXT'S FOR DOWNLOAD (2005-02-11 kate) # By user request ssh kolossus # kksilo currently off-limit for logins cd /cluster/data/panTro1/bed mkdir -p blastz.hg17.swp cd blastz.hg17.swp mkdir axtChain cd axtChain cp /cluster/data/hg17/bed/blastz.panTro1/axtChain/chimp.net . netSplit chimp.net chimpNet chainSwap /cluster/data/hg17/bed/blastz.panTro1/axtChain/all.chain all.chain chainSplit chain all.chain mkdir -p ../axtNet cat > makeAxt.csh << 'EOF' foreach f (chimpNet/chr*.net) set c = $f:t:r echo "axtNet on $c" netToAxt chimpNet/$c.net chain/$c.chain /cluster/data/panTro1/nib /cluster/data/hg17/nib stdout | axtSort stdin ../axtNet/$c.axt end 'EOF' csh makeAxt.csh >&! makeAxt.log & tail -100f makeAxt.log cd ../axtNet nice gzip *.axt ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/panTro1 mkdir -p vsHg17 cd vsHg17 mkdir -p axtNet nice cp -rp /cluster/data/panTro1/bed/blastz.hg17.swp/axtNet/*.axt.gz axtNet ######################################################################### # MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram) ############################################################################ ## BLASTZ swap from mm8 alignments (DONE - 2006-02-23 - Hiram) ssh pk cd /cluster/data/mm8/bed/blastzPanTro1.2006-02-23 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits panTro1 chainMm8Link # 901976621 bases of 2733948177 (32.992%) in intersection ############################################################################ # LIFTOVER CHAINS TO PANTRO2 # Split (using makeLoChain-split) of panTro2 is doc'ed in makePanTro2.doc # Do what makeLoChain-split says to do next (start blat alignment) ssh kk cd /cluster/data/panTro1/bed/liftOver makeLoChain-align panTro1 /scratch/hg/panTro1/nib panTro2 \ /iscratch/i/panTro2/split10k \ /cluster/bluearc/panTro2/11.ooc >&! align.log & # Do what its output says to do next (start cluster job) cd /cluster/data/panTro1/bed/blat.panTro2.2006-04-04/run para shove para time >&! run.time #CPU time in finished jobs: 906023s 15100.39m 251.67h 10.49d 0.029 y #IO & Wait Time: 22074s 367.90m 6.13h 0.26d 0.001 y #Average job time: 343s 5.72m 0.10h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 4260s 71.00m 1.18h 0.05d #Submission to last job: 4965s 82.75m 1.38h 0.06d # lift alignments ssh kkr1u00 cd /cluster/data/panTro1/bed/liftOver makeLoChain-lift panTro1 panTro2 >&! lift.log & # chain alignments ssh kki cd /cluster/data/panTro1/bed/liftOver makeLoChain-chain panTro1 /scratch/hg/panTro1/nib \ panTro2 /scratch/hg/panTro2/nib >&! chain.log & # Do what its output says to do next (start cluster job) cd /cluster/data/panTro1/bed/blat.panTro2.2006-04-04/chainRun para shove para time >&! run.time #CPU time in finished jobs: 3884s 64.73m 1.08h 0.04d 0.000 y #IO & Wait Time: 594s 9.91m 0.17h 0.01d 0.000 y #Average job time: 86s 1.44m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 245s 4.08m 0.07h 0.00d #Submission to last job: 401s 6.68m 0.11h 0.00d # net alignment chains ssh kkstore03 cd /cluster/data/panTro1/bed/liftOver makeLoChain-net panTro1 panTro2 >&! net.log & # load reference to over.chain into database table, # and create symlinks /gbdb and download area ssh hgwdev cd /cluster/data/panTro1/bed/liftOver makeLoChain-load panTro1 panTro2 >&! load.log & # test by converting a region using the "convert" link on # the browser, and comparing to blat of the same region ####################################################################### # 2bit file to filesystems for kluster runs (2006-07-11 kate) ssh kkr1u00 mkdir /iscratch/i/panTro1 cd /iscratch/i/panTro1 cp -p /cluster/data/panTro1/panTro1.2bit . cp -p /cluster/data/panTro1/chrom.sizes . foreach R (2 3 4 5 6 7 8) rsync -a --progress /iscratch/i/panTro1/ kkr${R}u00:/iscratch/i/panTro1/ end # request push to /scratch/hg from cluster admins ? # GOT HERE -- need pk and SAN to be up to complete ssh pk mkdir /san/sanvol1/scratch/panTro1 cd /san/sanvol1/scratch/panTro1 cp -p /cluster/data/panTro1/panTro1.2bit . cp -p /cluster/data/panTro1/chrom.sizes . ####################################################################### # dbSNP BUILD 125 (Heather, August 2006) # Our panTro1 has a ctgPos but it doesn't match the dbSNP ContigInfo, # so I worked just with non-random chrom coords as provided in ContigLoc. # dbSNP is using the new convention for chimp chroms, so I translated them: # chr2A --> chr12 # chr2B --> chr13 # chr3 --> chr2 # chr4 --> chr3 # chr5 --> chr4 # chr6 --> chr5 # chr7 --> chr6 # chr8 --> chr7 # chr9 --> chr11 # chr10 --> chr8 # chr11 --> chr9 # chr12 --> chr10 # chr13 --> chr14 # chr14 --> chr15 # chr15 --> chr16 # chr16 --> chr18 # chr17 --> chr19 # chr18 --> chr17 # chr19 --> chr20 # chr20 --> chr21 # chr21 --> chr22 # chr22 --> chr23 # Set up directory structure ssh kkstore02 cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598 mkdir data mkdir schema mkdir rs_fasta # Get data from NCBI (anonymous FTP) cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/data ftp ftp.ncbi.nih.gov cd snp/organisms/chimpanzee_9598/database/organism_data # ContigLoc table has coords, orientation, loc_type, and refNCBI allele get b125_SNPContigLoc_1_1.bcp.gz # ContigLocusId has function get b125_SNPContigLocusId_1_1.bcp.gz get b125_ContigInfo_1_1.bcp.gz # MapInfo has alignment weights get b125_SNPMapInfo_1_1.bcp.gz # SNP has univar_id, validation status and heterozygosity get SNP.bcp.gz # Get schema from NCBI cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/schema ftp ftp.ncbi.nih.gov cd snp/organisms/chimpanzee_9598/database/organism_schema get chimpanzee_9598_table.sql.gz # Get fasta files from NCBI # using headers of fasta files for molType cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/rs_fasta ftp ftp.ncbi.nih.gov cd snp/organisms/chimpanzee_9598/rs_fasta prompt mget *.gz # add rs_fasta to seq/extFile # 2 edits first: strip header to just rsId, and remove duplicates # work on /cluster/store12 (kkstore05) which has more disk space cp rs_ch*.fas.gz /cluster/store12/snp/125/chimp/rs_fasta ssh kkstore05 cd /cluster/store12/snp/125/chimp/rs_fasta # concat into rsAll.fas cat << '_EOF_' > concat.csh #!/bin/csh -ef rm -f rsAll.fas foreach file (rs_ch*.fas) echo $file zcat $file >> rsAll.fas end '_EOF_' # snpCleanSeq strips the header and skips duplicates /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa rm rsAll.fas # load on hgwdev ssh hgwdev mkdir /gbdb/panTro1/snp ln -s /cluster/store12/snp/125/chimp/rs_fasta/snp.fa /gbdb/panTro1/snp/snp.fa cd /cluster/store12/snp/125/chimp/rs_fasta hgLoadSeq panTro1 /gbdb/chimp/snp/snp.fa # look up id in extFile # move into separate table hgsql panTro1 < snpSeq.sql hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 179085' panTro1 hgsql -e 'delete from seq where extFile = 179085' panTro1 hgsql -e 'alter table snpSeq add index acc (acc)' panTro1 # clean up after hgLoadSeq rm seq.tab # Simplify names of data files cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/data mv b125_ContigInfo_1_1.bcp.gz ContigInfo.gz mv b125_SNPContigLoc_1_1.bcp.gz ContigLoc.gz mv b125_SNPContigLocusId_1_1.bcp.gz ContigLocusId.gz mv b125_SNPMapInfo_1_1.bcp.gz MapInfo.gz mv SNP.bcp.gz SNP.gz ls -1 *.gz > filelist # edit table descriptions cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/schema # get CREATE statements from chimpanzee_9598_table.sql for our 5 tables # store in table.tmp # convert and rename tables sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp rm table.tmp sed -f 'tableRename.sed' table2.tmp > table.sql rm table2.tmp # get header lines from rs_fasta cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/rs_fasta /bin/csh gnl.csh # load on kkr5u00 ssh kkr5u00 hgsql -e mysql 'create database panTro1snp125' cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/schema hgsql panTro1snp125 < table.sql cd ../data /bin/csh load.csh # note rowcount # ContigLoc 1666593 # SNP 1542718 # MapInfo 1608359 # ContigLocusId 582130 # Get UniVariation from 126 (recently downloaded for mm8snp126) cd /cluster/data/dbSNP/126/shared hgsql panTro1snp125 < UniVariation.sql zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' panTro1snp125 # create working /scratch dir cd /scratch/snp/125 mkdir chimp cd chimp # panTro1.ctgPos table uses different convention # get gnl files cp /cluster/data/dbSNP/125/organisms/chimpanzee_9598/rs_fasta/*.gnl . # examine ContigInfo for group_term and edit pipeline.csh # use "ref_assembly" # filter ContigLoc into ContigLocFilter # errors reported to stdout # this gets rid of alternate assemblies (using ContigInfo) # this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo) # assumes all contigs are positively oriented; will abort if not true mysql> desc ContigLocFilter; # +---------------+-------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +---------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromName | varchar(32) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | start | int(11) | NO | | | | # | end | int(11) | YES | | NULL | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # +---------------+-------------+------+-----+---------+-------+ /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter125 panTro1snp125 ref_assembly Arachne # note rowcount # ContigLocFilter 1354272 # how many are positive strand? hopefully 90% # We get 100% here mysql> select count(*) from ContigLocFilter where orientation = 0; # 1270940 # note count by loc_type mysql> select count(*), loc_type from ContigLocFilter group by loc_type; # +----------+----------+ # | count(*) | loc_type | # +----------+----------+ # | 1469 | 1 | # | 1350271 | 2 | # | 2532 | 3 | # +----------+----------+ # filter ContigLocusId into ContigLocusIdFilter /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter panTro1snp125 ref_assembly # note rowcount 559975 # condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions) # assumes SNPs are in numerical order; will errAbort if not true /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense panTro1snp125 # note rowcount; expect about 50% (ascertainment bias for SNPs within genes) # actually we get about 90% here # ContigLocusIdCondense 539366 # could delete ContigLocusIdFilter table here # create chrN_snpFasta tables from *.gnl files # we are just using molType, but also storing class and observed # need chromInfo for this # convert to our panTro1 chrom convention /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta panTro1snp125 /bin/csh renameFasta.csh # (could start using pipeline.csh here) # (pipeline.csh takes about 35 minutes to run) # split ContigLocFilter by chrom # create the first chrN_snpTmp # we will reuse this table name, adding/changing columns as we go # at this point chrN_snpTmp will have the same description as ContigLocFilter # this opens a file handle for every chrom, so will not scale to scaffold-based assemblies /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom panTro1snp125 ref_assembly # check coords using loc_type # possible errors logged to snpLocType.error: # Unknown locType # Between with allele != '-' # Exact with end != start + 1 # Range with end < start # possible exceptions logged to snpLocType.exceptions: # RefAlleleWrongSize # This run 4 exceptions/errors: couldn't handle N(100) # morph chrN_snpTmp mysql> desc chr1_snpTmp; # +---------------+-------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +---------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # +---------------+-------------+------+-----+---------+-------+ /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype125 panTro1snp125 ref_assembly # expand allele as necessary # report syntax errors to snpExpandAllele.errors # possible exceptions logged to snpExpandAllele.exceptions: # RefAlleleWrongSize # 43 alleles expanded # Couldn't handle 4 that had N(100) # Fixed with revision 1.26 of snpExpandAllele.c and data edit. /cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele panTro1snp125 ref_assembly # the next few steps prepare for working in UCSC space # sort by position /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort panTro1snp125 ref_assembly # rename MT --> M hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" panTro1snp125 # chrom conversion!! hgsql -e 'rename table chr3_snpTmp to chr2_snpTmp' panTro1snp125 hgsql -e 'rename table chr4_snpTmp to chr3_snpTmp' panTro1snp125 hgsql -e 'rename table chr5_snpTmp to chr4_snpTmp' panTro1snp125 hgsql -e 'rename table chr6_snpTmp to chr5_snpTmp' panTro1snp125 hgsql -e 'rename table chr7_snpTmp to chr6_snpTmp' panTro1snp125 hgsql -e 'rename table chr8_snpTmp to chr7_snpTmp' panTro1snp125 hgsql -e 'rename table chr9_snpTmp to chr11A_snpTmp' panTro1snp125 hgsql -e 'rename table chr10_snpTmp to chr8_snpTmp' panTro1snp125 hgsql -e 'rename table chr11_snpTmp to chr9_snpTmp' panTro1snp125 hgsql -e 'rename table chr11A_snpTmp to chr11_snpTmp' panTro1snp125 hgsql -e 'rename table chr12_snpTmp to chr10_snpTmp' panTro1snp125 hgsql -e 'rename table chr2A_snpTmp to chr12_snpTmp' panTro1snp125 hgsql -e 'rename table chr22_snpTmp to chr23_snpTmp' panTro1snp125 hgsql -e 'rename table chr21_snpTmp to chr22_snpTmp' panTro1snp125 hgsql -e 'rename table chr20_snpTmp to chr21_snpTmp' panTro1snp125 hgsql -e 'rename table chr19_snpTmp to chr20_snpTmp' panTro1snp125 hgsql -e 'rename table chr17_snpTmp to chr19_snpTmp' panTro1snp125 hgsql -e 'rename table chr18_snpTmp to chr17_snpTmp' panTro1snp125 hgsql -e 'rename table chr16_snpTmp to chr18_snpTmp' panTro1snp125 hgsql -e 'rename table chr15_snpTmp to chr16_snpTmp' panTro1snp125 hgsql -e 'rename table chr14_snpTmp to chr15_snpTmp' panTro1snp125 hgsql -e 'rename table chr13_snpTmp to chr14_snpTmp' panTro1snp125 hgsql -e 'rename table chr2B_snpTmp to chr13_snpTmp' panTro1snp125 # get nib files ssh hgwdev cd /cluster/data/panTro1/nib cp *.nib /cluster/home/heather/transfer/snp/panTro1 ssh kkr5u00 cd /scratch/snp/125/chimp mkdir nib cp /cluster/home/heather/transfer/snp/panTro1/*.nib nib rm /cluster/home/heather/transfer/snp/panTro1/*.nib # edit path in chromInfo, load into panTro1snp125 with editted path # lookup reference allele in nibs # keep reverse complement to use in error checking (snpCheckAlleles) # check here for SNPs larger than 1024 # errAbort if detected # check for coords that are too large, log to snpRefUCSC.error and skip # This run no errors /cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC panTro1snp125 # morph chrN_snpTmp mysql> desc chr1_snpTmp; # +--------------------+-------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +--------------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # | refUCSC | blob | YES | | NULL | | # | refUCSCReverseComp | blob | YES | | NULL | | # +--------------------+-------------+------+-----+---------+-------+ # compare allele from dbSNP to refUCSC # locType between is excluded from this check # log exceptions to snpCheckAllele.exceptions # if SNP is positive strand, expect allele == refUCSC # log RefAlleleMismatch if not # if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp # If allele == refUCSCRevComp, log RefAlleleNotRevComp # If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch # This run we got: # 0 RefAlleleMismatch # 6800 RefAlleleNotRevComp /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles panTro1snp125 # add class and observed using univar_id from SNP table # to get class (subsnp_class) and observed (var_str) from UniVariation # log errors to snpClassAndObserved.errors # errors detected: # class = 0 in UniVariation # class > 8 in UniVariation # univar_id = 0 in SNP # no row in SNP for snp_id in chrN_snpTmp # This run we got: # 3 class = 0 in UniVariation # 0 class > 8 in UniVariation # 0 univar_id = 0 in SNP # 1 no row in SNP for snp_id in chrN_snpTmp # dbSNP has class = 'in-del' # we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3 # actually for this run all class = 'single' /cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved panTro1snp125 # morph chrN_snpTmp # +--------------------+---------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +--------------------+---------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | class | varchar(255) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # | refUCSC | blob | YES | | NULL | | # | refUCSCReverseComp | blob | YES | | NULL | | # | observed | blob | YES | | NULL | | # +--------------------+---------------+------+-----+---------+-------+ # generate exceptions for class and observed # SingleClassBetweenLocType # SingleClassRangeLocType # NamedClassWrongLocType # ObservedWrongFormat # ObservedWrongSize # ObservedMismatch # RangeSubstitutionLocTypeExactMatch # SingleClassTriAllelic # SingleClassQuadAllelic # This will also detect IUPAC symbols in allele # None of these this run /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved panTro1snp125 # add function /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction panTro1snp125 # add validation status and heterozygosity # log error if validation status > 31 or missing # no errors this run /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP panTro1snp125 # add molType # errors detected: missing or duplicate molType # this run 4557 duplicates /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype panTro1snp125 # generate chrN_snp125 and snp125Exceptions tables cp snpCheckAlleles.exceptions snpCheckAlleles.tab cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab cp snpExpandAllele.exceptions snpExpandAllele.tab cp snpLocType.exceptions snpLocType.tab /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable panTro1snp125 125 # concat into snp125.tab # cat chr*_snp125.tab >> snp125.tab /bin/sh concat.sh # check for multiple alignments /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple panTro1snp125 125 mysql> load data local infile 'snpMultiple.tab' into table snp125Exceptions; # load on hgwdev cp snp125.tab /cluster/home/heather/transfer/snp/panTro1 hgsql panTro1snp125 -e 'select * from snp125Exceptions' > /cluster/home/heather/transfer/snp/panTro1/snp125Exceptions.tab ssh hgwdev cd transfer/snp/panTro1 mysql> load data local infile 'snp125.tab' into table snp125; mysql> load data local infile 'snp125Exceptions.tab' into table snp125Exceptions; # create indexes mysql> alter table snp125 add index name (name); mysql> alter table snp125 add index chrom (chrom, bin); mysql> alter table snp125Exceptions add index name(name); # create snp125ExceptionDesc table cd /cluster/data/dbSNP hgsql panTro1 < snp125ExceptionDesc.sql # add counts to exception.panTro1.125, can start with exception.template hgsql -e 'select count(*), exception from snp125Exceptions group by exception' panTro1 mysql> load data local infile 'exception.panTro1.125' into table snp125ExceptionDesc; mysql> select count(*), exception from snp125Exceptions group by exception; +----------+---------------------------+ | count(*) | exception | +----------+---------------------------+ | 24785 | MultipleAlignments | | 675 | ObservedMismatch | | 6800 | RefAlleleNotRevComp | | 4 | RefAlleleWrongSize | | 2532 | SingleClassBetweenLocType | | 3 | SingleClassQuadAllelic | | 1469 | SingleClassRangeLocType | | 61 | SingleClassTriAllelic | +----------+---------------------------+ ####################################################################### # xenoRefGene (2010-04-10 markd) # removed special-casing of Human RefSeqs # delete old data files # update genbank.conf and realign: ssh genbank cd /cluster/genbank/genbank ./bin/gbAlignStep -initial panTro1 ssh hgwdev cd /cluster/genbank/genbank ./bin/gbDbLoadStep -drop -initialLoad panTro1 ############################################################################