#!/bin/csh -f # set emacs mode exit; # don't actually run this like a script :) # Cat Mar 2006 Broad Assembly # DOWNLOAD SEQUENCE ssh kkstore01 mkdir /cluster/store2/felCat1 cd /cluster/data ln -s /cluster/store2/felCat1 felCat1 cd /cluster/data/felCat1 mkdir jkStuff bed mkdir broad cd broad wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/cat/felCat1/assembly.agp" wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/cat/felCat1/assembly.bases.gz" # wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/cat/felCat1/assembly.quals.gz" # wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/cat/felCat1/unplaced.fasta.gz" # wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/cat/felCat1/unplaced.qual.gz" gunzip assembly.bases.gz faSize assembly.bases # 1,642,698,377 bases (0 N's 1642698377 real 1642698377 upper 0 lower) in 817956 sequences in 1 files # Total size: mean 2008.3 sd 1931.8 min 50 (contig_713825) max 146926 (contig_534097) median 1394 # N count: mean 0.0 sd 0.0 # U count: mean 2008.3 sd 1931.8 # L count: mean 0.0 sd 0.0 /cluster/bin/x86_64/agpAllToFaFile assembly.agp assembly.bases ../felCat1.fa faSize ../felCat1.fa # 4045535322 bases (2402836945 N's 1642698377 real 1642698377 upper 0 lower) in 217790 sequences in 1 files # Total size: mean 18575.4 sd 53162.6 min 56 (scaffold_206924) max 1539409 (scaffold_217569) median 1960 # N count: mean 11032.8 sd 31089.2 # U count: mean 7542.6 sd 24907.3 # L count: mean 0.0 sd 0.0 /cluster/bin/scripts/agpToLift < assembly.agp > ../jkStuff/assembly.lft # PARTITION SCAFFOLDS FOR REPEATMASKER RUN # glom the tiny scaffolds up into ~50k collections ssh kkstore01 cd /cluster/data/felCat1 mkdir chunks50k faSplit about broad/assembly.bases 50000 chunks50k/chunk_ cd chunks50k for i in 0 1 2 3 4 5 6 7 8 9; do mkdir $i; mv *$i.fa $i; done # RUN REPEAT MASKER # make the run directory, output directory, and job list ssh kkstore01 cd /cluster/data/felCat1 tcsh cat << '_EOF_' > jkStuff/RMCat #!/bin/csh -fe cd $1 /bin/mkdir -p /tmp/felCat1/$2 /bin/cp $3 /tmp/felCat1/$2/ pushd /tmp/felCat1/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -species cat $2 popd /bin/cp /tmp/felCat1/$2/$2.out ./ /bin/rm -fr /tmp/felCat1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/felCat1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/felCat1 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMCat mkdir RMRun RMOut cd RMOut mkdir 0 1 2 3 4 5 6 7 8 9 cd ../chunks50k for i in */*.fa do e=`dirname $i` d="/cluster/data/felCat1" c=`basename $i` echo "../jkStuff/RMCat $d/RMOut/$e $c {check in line+ $d/chunks50k/$i} {check out line+ $d/RMOut/$e/$c.out}" done > ../RMRun/RMJobs # do the run ssh kk cd /cluster/data/felCat1/RMRun para create RMJobs para try, check, push, check,... # Completed: 31654 of 31654 jobs # CPU time in finished jobs: 16603766s 276729.43m 4612.16h 192.17d 0.527 y # IO & Wait Time: 362052s 6034.20m 100.57h 4.19d 0.011 y # Average job time: 536s 8.93m 0.15h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 6222s 103.70m 1.73h 0.07d # Submission to last job: 215588s 3593.13m 59.89h 2.50d # Lift up the split-scaffold .out's to scaffold .out's ssh kkstore01 cd /cluster/data/felCat1/RMOut for i in 0 1 2 3 4 5 6 7 8 9; do echo $i; liftUp -nohead $i.out ../jkStuff/assembly.lft warn $i/*.fa.out>/dev/null; done head -3 0.out > felCat1.out for i in 0 1 2 3 4 5 6 7 8 9; do tail +4 $i.out; done >> felCat1.out # Load the .out files into the database with: ssh hgwdev hgLoadOut felCat1 /cluster/data/felCat1/RMOut/felCat1.out # Lots of strange perc messages # Loading up table felCat1_rmsk # note: 6 records dropped due to repStart > repEnd hgsql felCat1 -e 'rename table felCat1_rmsk to rmsk' # Fix up the indices too: hgsql felCat1 -e 'drop index bin on rmsk; drop index genoStart on rmsk; drop index genoEnd on rmsk; \ create index bin on rmsk (genoName(16), bin); \ create index genoStart on rmsk (genoName(16), genoStart); \ create index genoEnd on rmsk (genoName(16), genoEnd);' # EXTRACTING GAP INFO FROM BLOCKS OF NS (DONE 11/5/04 angie) ssh kkstore01 mkdir /cluster/data/felCat1/bed/fakeAgp cd /cluster/data/felCat1/bed/fakeAgp faGapSizes ../../downloads/scaffolds.fasta -niceSizes=5,10,20,25,30,40,50,100,250,500,1000,10000,100000 # Wow, all block of N's seem to be exactly 100bp long. # hgFakeAgp's default -minContigGap of 25 will be fine. hgFakeAgp ../../downloads/scaffolds.fasta fake.agp ssh hgwdev hgLoadGap -unsplit felCat1 /cluster/data/felCat1/bed/fakeAgp/fake.agp # SIMPLE REPEATS (TRF) ssh kkstore01 mkdir /cluster/data/felCat1/bed/simpleRepeat cd /cluster/data/felCat1/bed/simpleRepeat nice trfBig -trf=/cluster/bin/i386/trf ../../felCat1.fa /dev/null -bedAt=simpleRepeat.bed -tempDir=/tmp # check on this with tail -f trf.log # Load this into the database as so ssh hgwdev hgLoadBed felCat1 simpleRepeat /cluster/data/felCat1/bed/simpleRepeat/simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # FILTER SIMPLE REPEATS (TRF) INTO MASK # make a filtered version of the trf output: # keep trf's with period <= 12: ssh kkstore01 cd /cluster/data/felCat1/bed/simpleRepeat awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed # MASK FA USING REPEATMASKER AND FILTERED TRF FILES ssh kolossus cd /cluster/data/felCat1 /cluster/bin/x86_64/maskOutFa -soft felCat1.fa bed/simpleRepeat/trfMask.bed felCat1.simple.fa /cluster/bin/x86_64/maskOutFa -softAdd felCat1.simple.fa RMOut/felCat1.out felCat1.masked.fa mv felCat1.fa felCat1.unmasked.fa # Now clean up the unmasked split scaffolds to avoid confusion later. rm -r chunks500k scaffoldsSplit.fa jkStuff/scaffoldsSplit.lft # CREATING DATABASE # Create the database. ssh hgwdev # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql Filesystem Size Used Avail Use% Mounted on # /dev/sdc1 1.8T 915G 746G 56% /var/lib/mysql hgsql '' -e 'create database felCat1' # STORE SEQUENCE AND ASSEMBLY INFORMATION # Translate to 2bit ssh kkstore01 cd /cluster/data/felCat1 /cluster/bin/x86_64/faToTwoBit felCat1.masked.fa felCat1.2bit # Make chromInfo.tab. mkdir bed/chromInfo twoBitInfo felCat1.2bit stdout | awk '{printf "%s\t%s\t/gbdb/felCat1/felCat1.2bit\n",$1,$2;}' \ | sort > bed/chromInfo/chromInfo.tab # Make symbolic a link from /gbdb/felCat1 to the 2bit. ssh hgwdev mkdir -p /gbdb/felCat1 ln -s /cluster/data/felCat1/felCat1.2bit /gbdb/felCat1/