# for emacs: -*- mode: sh; -*-

# Apis mellifera --
#
# Baylor HGSC's May 5, 2005 (Amel_3.0) assembly
# ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050501-freeze/
#
# NOTE(review): this file reads as a step-by-step record of commands; a bare
# "ssh <host>" line appears to mean "the commands that follow were run on
# <host>" rather than being a command to execute inline -- confirm before
# running any section non-interactively.

# DOWNLOAD SEQUENCE (DONE 12/12/05 Andy)
df -h /cluster/store*
#Filesystem            Size  Used Avail Use% Mounted on
#kkstore01-10:/export/cluster/store9
#                      1.3T  697G  546G  57% /cluster/store9
# store9, whatever
mkdir -p /cluster/store9/apiMel3/downloads
ln -s /cluster/store9/apiMel3 /cluster/data/apiMel3
ln -s /cluster/data/apiMel3 ~/apiMel3
# Relative path: relies on the current directory being the one that holds
# the apiMel3 symlink created just above.
cd apiMel3/downloads/
wget -N ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050501-freeze/contigs/All_Groups_20050501.agp
# Keep only the first nine AGP columns before condensing.
cut -f1-9 All_Groups_20050501.agp > tmp.agp
agpCondense tmp.agp apiMel3.agp
rm tmp.agp
# Fetch Group1..Group16 plus GroupUn and append them all into one contigs file.
for i in {1..16} Un; do
  f=Group${i}_20050501.fa.gz
  wget -N "ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050501-freeze/contigs/${f}"
  zcat "$f" >> contigs.fa
done
agpAllToFaFile apiMel3.agp contigs.fa apiMel3.fa
rm contigs.fa

# CREATING DATABASE (DONE 12/13/2005 Andy)
ssh hgwdev
hgsql '' -e 'create database apiMel3'

# LOAD GAP & GOLD TABLES FROM AGP (DONE 12/13/2005 Andy)
ssh hgwdev
cd /cluster/data/apiMel3/downloads
hgGoldGapGl -noGl apiMel3 apiMel3.agp
hgsql apiMel3 -e 'analyze table gold; analyze table gap;'

# REPEATMASK (DONE 12/13/2005 Andy)
ssh hgwdev
cd san/honeybee/
# rmsk/out is where the cluster jobs write (see gsub below); it was
# previously created as "output", which nothing else referenced.
mkdir -p unmaskedSplits/splits rmsk/{run,out}
faSplit -outDirDepth=1 -lift=unmaskedSplits/splits.lft gap /cluster/data/apiMel3/downloads/apiMel3.fa 500000 unmaskedSplits/honey_
cd rmsk/
ln -s ../unmaskedSplits splits
cd run/
# Per-job wrapper: copy one split to /scratch, RepeatMask it there, and copy
# the .out file back next to the job directory.
cat > rmsk.sh << "EOF"
#!/bin/bash
# Usage: rmsk.sh <input.fa> <output.out>   (both relative to the run dir)
fa=$(basename "$1")
# An array keeps "-spec" and "apis" as two separate arguments.  The previous
# form, "param=-spec apis", made bash run a command named "apis" with
# param=-spec in its environment and left $param unset for RepeatMasker.
param=(-spec apis)
# Record the starting directory as an absolute path before leaving it;
# "dirs +1" can print a "~"-abbreviated path that is not re-expanded when
# used from a variable.
oldDir=$PWD
pushd /scratch
cp "$oldDir/$1" .
/cluster/bluearc/RepeatMasker/RepeatMasker "${param[@]}" "$fa"
mkdir -p "$oldDir/$(dirname "$2")"
cp "$fa.out" "$oldDir/$2"
rm "$fa"*
popd
EOF
chmod +x rmsk.sh
cat > gsub << "EOF"
#LOOP
./rmsk.sh {check in line+ $(path1)} {check out exists ../out/$(file1).out}
#ENDLOOP
EOF
find ../splits/ -name '*.fa' > fa.lst
gensub2 fa.lst single gsub spec
para create spec
para try
para push
para time
#Completed: 1247 of 1247 jobs
#CPU time in finished jobs:     809754s   13495.90m   224.93h    9.37d  0.026 y
#IO & Wait Time:                  9761s     162.68m     2.71h    0.11d  0.000 y
#Average job time:                 657s      10.95m     0.18h    0.01d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            1847s      30.78m     0.51h    0.02d
#Submission to last job:          4476s      74.60m     1.24h    0.05d
cd ../
# Build one combined .out: a single 3-line header followed by the body
# (line 4 onward) of every per-split result larger than 80 bytes.
head -n 3 out/honey_0177.fa.out > head
# "tail -n +4" replaces the obsolete "tail +4" spelling.
find out/ -name "*.fa.out" -type f -size +80c -exec tail -n +4 '{}' ';' > tmp
cat head tmp > tmp2
liftUp rmsk.out splits/splits.lft warn tmp2
rm head tmp tmp2
ssh hgwdev
cd san/apiMel3/rmsk
hgLoadOut apiMel3 rmsk.out
hgsql apiMel3 -e 'rename table rmsk_rmsk to rmsk'
# Rebuild the rmsk indexes with an 11-character genoName prefix.
hgsql apiMel3 -e 'drop index bin on rmsk; \
  drop index genoStart on rmsk; \
  drop index genoEnd on rmsk; \
  create index bin on rmsk (genoName(11), bin); \
  create index genoStart on rmsk (genoName(11), genoStart); \
  create index genoEnd on rmsk (genoName(11), genoEnd);'
# Drop the symlink to the splits before copying the results to permanent
# storage.
rm splits
cd ../
cp -r rmsk/ /cluster/data/apiMel3/
rm -rf rmsk/

# SIMPLE REPEATS (TRF) (DONE 10/22/2005 Andy)
ssh pk
cd san/apiMel3
mkdir -p simpleRepeat/{run,out}
cd simpleRepeat/
ln -s ../unmaskedSplits splits
cd run/
cat << "_EOF_" > gsub
#LOOP
./trfBig.sh {check in line+ $(path1)} {check out line+ ../out/$(lastDir1)/$(root1).bed}
#ENDLOOP
_EOF_
# Per-job wrapper: run trfBig on one split inside /scratch and copy the
# resulting .bed back to the requested output path.
cat << "_EOF_" > trfBig.sh
#!/bin/bash
# Usage: trfBig.sh <input.fa> <output.bed>
outDir=$(dirname "$2")
outName=$(basename "$2")
inName=$(basename "$1")
mkdir -p "$outDir"
cp "$1" /scratch
pushd /scratch
trfBig "$inName" "$outName" -bed -tempDir=/tmp > /dev/null
popd
cp "/scratch/$outName" "$2"
rm "/scratch/$outName" "/scratch/$inName"
_EOF_
chmod +x trfBig.sh
find ../splits/ -name '*.fa' > fa.lst
gensub2 fa.lst single gsub spec
para create spec
para push
para time
#Completed: 1246 of 1247 jobs
#CPU time in finished jobs:       3355s      55.91m     0.93h    0.04d  0.000 y
#IO & Wait Time:                 10020s     167.00m     2.78h    0.12d  0.000 y
#Average job time:                  11s       0.18m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             130s       2.17m     0.04h    0.00d
#Submission to last job:           179s       2.98m     0.05h    0.00d
ssh hgwdev
cd san/apiMel3/simpleRepeat/
# Collect the "honey..." records from every per-split .bed into one file.
find out/ -name "*.bed" -type f -exec grep "^honey" '{}' ';' > tmp
liftUp trf.bed splits/splits.lft warn tmp
rm tmp
hgLoadBed -sqlTable=/cluster/home/aamp/kent/src/hg/lib/simpleRepeat.sql \
  apiMel3 simpleRepeat trf.bed
rm splits
cd ../
# Make sure bed/ exists so the copy lands at bed/simpleRepeat (the path the
# later sections read) instead of becoming bed/ itself.
mkdir -p /cluster/data/apiMel3/bed
cp -r simpleRepeat/ /cluster/data/apiMel3/bed

# MAKE MASKED NIBS/2BIT (DONE 10/22/2005 Andy)
# make a filtered version of the trf output keep trf's with period <= 12:
ssh hgwdev
cd /cluster/data/apiMel3
zcat bed/simpleRepeat/trf.bed | awk '{if ($5 <= 12) print;}' > trfMask.bed
# Soft-mask with the filtered TRF intervals first, then add the RepeatMasker
# intervals on top.
maskOutFa -soft downloads/apiMel3.fa trfMask.bed tmp
maskOutFa -softAdd tmp rmsk/rmsk.out apiMel3.masked.fa
faToTwoBit apiMel3.masked.fa apiMel3.2bit
hgsql apiMel3 < ~/kent/src/hg/lib/chromInfo.sql
# chromInfo rows: name, size, path of the 2bit under /gbdb.
twoBitInfo apiMel3.2bit /dev/stdout | awk '{printf("%s\t%s\t/gbdb/apiMel3/apiMel3.2bit\n", $1, $2)}' > chrom.sizes
echo "load data local infile 'chrom.sizes' into table chromInfo;" | hgsql apiMel3
mkdir maskedFa nib
faSplit byname apiMel3.masked.fa maskedFa/
cd maskedFa/
# One soft-masked .nib per split-out .fa file.
for f in *; do
  g=${f%.fa}
  faToNib -softMask "$f" "../nib/${g}.nib"
done
cd ../
rm apiMel3.masked.fa tmp trfMask.bed

#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 10/5/07 angie)
ssh kkstore04
cd /cluster/data/apiMel3
# Put files in the usual chrom-based assembly locations:
zcat bed/simpleRepeat/trf.bed.gz \
  | awk '{if ($5 <= 12) print;}' \
  | splitFileByColumn stdin -ending=.bed bed/simpleRepeat/trfMaskChrom
splitFileByColumn -chromDirs downloads/apiMel3.agp .
# Split rmsk.out on column 5, re-attaching the 3-line header to each piece.
head -n 3 rmsk/rmsk.out > /tmp/rmskHeader
tail -n +4 rmsk/rmsk.out \
  | sort -k5,5 \
  | splitFileByColumn -chromDirs -head=/tmp/rmskHeader -col=5 \
      -ending=.fa.out stdin .
mkdir jkStuff
makeDownloads.pl apiMel3 -verbose=2 \
  >& jkStuff/downloads.log & tail -f jkStuff/downloads.log
# Edited the README's -- instructions at the end of downloads.log.
# /cluster/data/apiMel3/goldenPath/database/README.txt
# /cluster/data/apiMel3/goldenPath/bigZips/README.txt
# /cluster/data/apiMel3/goldenPath/chromosomes/README.txt

#########################################################################