# for emacs: -*- mode: sh; -*- # SNP geographic data from the Human Genome Diversity Project -- # ancestral allele frequencies for 53 populations worldwide. # I'm adapting scripts by John Novembre, jnovembre at ucla, # and Joe Pickrell, pickrell at uchicago. ########################################################################### # (2/3/09 - 2/5/09 angie) mkdir /hive/data/outside/hgdpGeo cd /hive/data/outside/hgdpGeo # Saved files emailed from Joe Pickrell, pickrell at uchicago: # HGDP_HapMap.coords.txt, HGDP_HapMap.coords_adj.txt, HGDP_supp.pdf, # illumina_snps.gz and plotFreqs.sh. Download allele frequency files: set i = 1 while ($i <= 22) wget --timestamping \ http://hgdp.uchicago.edu/data/Alfreqs/chr$i.popfreqs.ancfrq.strat.gz set i = `expr $i + 1` end wget --timestamping -O chrX.popfreqs.ancfrq.strat.gz \ http://hgdp.uchicago.edu/data/Alfreqs/chr23.popfreqs.ancfreq.strat.gz # Broken down by chroms... confirm that rsID sections are unique: zcat chr*.gz | cut -f 2 | uniq > rsIDs.lst wc -l rsIDs.lst #657000 rsIDs.lst sort -u rsIDs.lst | wc -l #657000 # The rsIDs in illumina_snps.gz were directly genotyped -- other rsIDs # were imputed from HapMap data (not as high-confidence). How many # of the SNPs were imputed? zcat illumina_snps.gz | grep -vFwf - rsIDs.lst | wc -l #0 # So there are no imputed SNPs in the chr*.strat files! -- confirmed w/Joe. # How many populations, on average, does each rsID have data for? zcat chr*.gz | wc -l #34821000 calc 34821000 / 657000 #34821000 / 657000 = 53.000000 # Wow, that is an awfully round number. Of the 59 populations for # which we have coords, how many appear in the frequency files? zcat chr*.gz | cut -f 3 | sort -u > popsWithData.txt wc -l popsWithData.txt #53 popsWithData.txt # OK, we can assume exactly 53. Which populations don't we have data for? grep -vFwf popsWithData.txt HGDP_HapMap.coords_adj.txt #Population Latitude[DegreesNorth] Longitude[DegreesEast] #Herero -22 19 #Ovambo -19 18 #Pedi -29 30 #Sotho -29 29 #Tswana -28 24 #Zulu -28 31 # Generate a perl hash of populations w/data => alphabetical index # for storing frequencies in an array of floats: grep -Fwf popsWithData.txt HGDP_HapMap.coords_adj.txt | sort \ | perl -wpe 'chomp; @w=split; $index = ($. - 1); \ $_ = "\t \"$w[0]\" => $index,\n"; \ s/\t \"Adygei/\%pops = (\"Adygei/; s/(Yoruba.*),/$1 );/;' \ > popHash.pl # Generate a C array of structs that encode population name, lat and long # for plotting: grep -Fwf popsWithData.txt HGDP_HapMap.coords_adj.txt | sort \ | perl -wpe 'chomp; @w=split; \ $_ = " { \"$w[0]\", $w[1], $w[2] },\n"; \ s/( { \"Adygei)/struct hgdpPopInfo pops[] =\n{\n$1/; s/(Yoruba.*),/$1\n};/;' \ > popInfo.c # Now distill the frequency data into a tab-sep text file sorted by rsId. # This can be joined with snp12X.txt (sorted by rsId) to prepend chrom, # chromStart and chromEnd to make it a track table. zcat chr*.gz \ | perl -we 'use vars "%pops"; eval `cat popHash.pl`; \ while (<>) { \ (undef, $rsId, $pop, $anc, $der, $freq, undef, undef) = split; \ if (! defined $lastRs || $lastRs ne $rsId) { \ if (defined $lastRs) { \ die "Wrong number of freqs for $rsId: " . scalar(@freqs) if (@freqs != 53); \ $freqs = join(",", @freqs) . ","; \ print "$lastRs\t$lastAnc\t$lastDer\t$freqs\n"; \ } \ ($lastRs, $lastAnc, $lastDer, @freqs) = ($rsId, $anc, $der, ()); \ } \ $index = $pops{$pop}; \ die "No index for pop $pop" unless (defined $index); \ $freqs[$index] = sprintf("%.4f", $freq); \ $freqs[$index] =~ s/([0-9])0+$/$1/; \ } \ $freqs = join(",", @freqs) . ","; \ print "$lastRs\t$lastAnc\t$lastDer\t$freqs\n";' \ | sort \ > hgdpGeoCoordless.txt # Took ~5min. wc -l hgdpGeoCoordless.txt #657000 hgdpGeoCoordless.txt # plotFreqs.sh requires Generic Mapping Tools -- installed in # /hive/data/outside/GMT4.3.1 using a GMTparam.txt generated by # the install form on http://gmt.soest.hawaii.edu/ . # Add /hive/data/outside/GMT4.3.1/bin to PATH # Add /hive/data/outside/GMT4.3.1/man to MANPATH # I split plotFreqs.sh into two parts: plotMapBackground.sh, to be # run once, and plotFreqsOnly.sh, to be run on each rsID. ./plotMapBackground.sh foreach rsID (rs5939319 rs5982868 rs11152550) zcat chrX.popfreqs.ancfrq.strat.gz | head -3000 | grep $rsID > $rsID time plotFreqs_UCSC.sh $rsID end # plotFreqsOnly is still kind of slow. Instead of running the ps-generating # commands every time, run once and use as templates for C fprintf: set RESETPOS = "-X-7.25i -Y-4.75i" echo "0.25 6.25 14 0 1 LT SNP: %s" \ | pstext -R0/9/0/6.5 -Jx1i $RESETPOS -O -K \ | sed -e 's/^/"/; s/$/\\n"/;' > rs.template echo "0.25 6.00 14 0 1 LT Ancestral Allele: %c" \ | pstext -R0/9/0/6.5 -Jx1i -O -K -Gblue \ | sed -e 's/^/"/; s/$/\\n"/;' > a1.template echo "0.25 5.75 14 0 1 LT Derived Allele: %c" \ | pstext -R0/9/0/6.5 -Jx1i -O -K -Gorange \ | sed -e 's/^/"/; s/$/\\n"/;' > a2.template # I was not up to reverse-engineering the geographic projection # used in psxy so I think we will have to just run that in hgc.c, oh # well. ########################################################################### # Make population key (done 3/17/09 angie) # Made script plotKey.sh that plots the map background (no lines), # small circles at all of the un-tweaked locations in # HGDP_HapMap.coords.txt, and population labels. ###########################################################################