/* udc - url data cache - a caching system that keeps blocks of data fetched from URLs in * sparse local files for quick use the next time the data is needed. * * This cache is enormously simplified by there being no local _write_ to the cache, * just reads. * * The overall strategy of the implementation is to have a root cache directory * with a subdir for each file being cached. The directory for a single cached file * contains two files - "bitmap" and "sparseData" that contains information on which * parts of the URL are cached and the actual cached data respectively. The subdirectory name * associated with the file is constructed from the URL in a straightforward manner. * http://genome.ucsc.edu/cgi-bin/hgGateway * gets mapped to: * rootCacheDir/http/genome.ucsc.edu/cgi-bin/hgGateway/ * The URL protocol is the first directory under the root, and the remainder of the * URL, with some necessary escaping, is used to define the rest of the cache directory * structure, with each '/' after the protocol line translating into another directory * level. * * The bitmap file contains time stamp and size data as well as an array with one bit * for each block of the file that has been fetched. Currently the block size is 8K. */ #include #include "common.h" #include "hash.h" #include "obscure.h" #include "bits.h" #include "linefile.h" #include "portable.h" #include "sig.h" #include "net.h" #include "cheapcgi.h" #include "udc.h" #define udcBlockSize (8*1024) /* All fetch requests are rounded up to block size. */ #define udcMaxBytesPerRemoteFetch (udcBlockSize * 32) /* Very large remote reads are broken down into chunks this size. */ struct connInfo /* Socket descriptor and associated info, for keeping net connections open. */ { int socket; /* Socket descriptor for data connection (or 0). */ bits64 offset; /* Current file offset of socket. */ int ctrlSocket; /* (FTP only) Control socket descriptor or 0. */ }; typedef int (*UdcDataCallback)(char *url, bits64 offset, int size, void *buffer, struct connInfo *ci); /* Type for callback function that fetches file data. */ struct udcRemoteFileInfo /* Information about a remote file. */ { bits64 updateTime; /* Last update in seconds since 1970 */ bits64 size; /* Remote file size */ struct connInfo ci; /* Connection info for open net connection */ }; typedef boolean (*UdcInfoCallback)(char *url, struct udcRemoteFileInfo *retInfo); /* Type for callback function that fetches file timestamp and size. */ struct udcProtocol /* Something to handle a communications protocol like http, https, ftp, local file i/o, etc. */ { struct udcProtocol *next; /* Next in list */ UdcDataCallback fetchData; /* Data fetcher */ UdcInfoCallback fetchInfo; /* Timestamp & size fetcher */ }; struct udcFile /* A file handle for our caching system. */ { struct udcFile *next; /* Next in list. */ char *url; /* Name of file - includes protocol */ char *protocol; /* The URL up to the first colon. http: etc. */ struct udcProtocol *prot; /* Protocol specific data and methods. */ time_t updateTime; /* Last modified timestamp. */ bits64 size; /* Size of file. */ bits64 offset; /* Current offset in file. */ char *cacheDir; /* Directory for cached file parts. */ char *bitmapFileName; /* Name of bitmap file. */ char *sparseFileName; /* Name of sparse data file. */ int fdSparse; /* File descriptor for sparse data file. */ boolean sparseReadAhead; /* Read-ahead has something in the buffer */ char *sparseReadAheadBuf; /* Read-ahead buffer, if any */ bits64 sparseRAOffset; /* Read-ahead buffer offset */ struct udcBitmap *bits; /* udcBitMap */ bits64 startData; /* Start of area in file we know to have data. */ bits64 endData; /* End of area in file we know to have data. */ bits32 bitmapVersion; /* Version of associated bitmap we were opened with. */ struct connInfo connInfo; /* Connection info for open net connection. */ }; struct udcBitmap /* The control structure including the bitmap of blocks that are cached. */ { struct udcBitmap *next; /* Next in list. */ bits32 blockSize; /* Number of bytes per block of file. */ bits64 remoteUpdate; /* Remote last update time. */ bits64 fileSize; /* File size */ bits32 version; /* Version - increments each time cache is stale. */ bits64 localUpdate; /* Time we last fetched new data into cache. */ bits64 localAccess; /* Time we last accessed data. */ boolean isSwapped; /* If true need to swap all bytes on read. */ int fd; /* File descriptor for file with current block. */ }; static char *bitmapName = "bitmap"; static char *sparseDataName = "sparseData"; #define udcBitmapHeaderSize (64) static int cacheTimeout = 0; #define MAX_SKIP_TO_SAVE_RECONNECT (udcMaxBytesPerRemoteFetch / 2) static void readAndIgnore(int sd, bits64 size) /* Read size bytes from sd and return. */ { static char *buf = NULL; if (buf == NULL) buf = needMem(udcBlockSize); bits64 remaining = size, total = 0; while (remaining > 0) { bits64 chunkSize = min(remaining, udcBlockSize); bits64 rd = read(sd, buf, chunkSize); if (rd < 0) errnoAbort("readAndIgnore: error reading socket after %lld bytes", total); remaining -= rd; total += rd; } if (total < size) errAbort("readAndIgnore: got EOF at %lld bytes (wanted %lld)", total, size); } static int connInfoGetSocket(struct connInfo *ci, char *url, bits64 offset, int size) /* If ci has an open socket and the given offset matches ci's current offset, * reuse ci->socket. Otherwise close the socket, open a new one, and update ci, * or return -1 if there is an error opening a new one. */ { if (ci != NULL && ci->socket > 0 && ci->offset != offset) { bits64 skipSize = (offset - ci->offset); if (skipSize > 0 && skipSize <= MAX_SKIP_TO_SAVE_RECONNECT) { verbose(2, "!! skipping %lld bytes @%lld to avoid reconnect\n", skipSize, ci->offset); readAndIgnore(ci->socket, skipSize); ci->offset = offset; } else { verbose(2, "Offset mismatch (ci %lld != new %lld), reopening.\n", ci->offset, offset); mustCloseFd(&(ci->socket)); if (ci->ctrlSocket > 0) mustCloseFd(&(ci->ctrlSocket)); ZeroVar(ci); } } int sd; if (ci == NULL || ci->socket <= 0) { char rangeUrl[2048]; if (ci == NULL) { safef(rangeUrl, sizeof(rangeUrl), "%s;byterange=%lld-%lld", url, offset, (offset + size - 1)); sd = netUrlOpen(rangeUrl); } else { safef(rangeUrl, sizeof(rangeUrl), "%s;byterange=%lld-", url, offset); sd = ci->socket = netUrlOpenSockets(rangeUrl, &(ci->ctrlSocket)); ci->offset = offset; } if (sd < 0) return -1; if (startsWith("http", url)) { char *newUrl = NULL; int newSd = 0; if (!netSkipHttpHeaderLinesHandlingRedirect(sd, rangeUrl, &newSd, &newUrl)) return -1; if (newUrl) { freeMem(newUrl); sd = newSd; if (ci != NULL) ci->socket = newSd; } } } else sd = ci->socket; return sd; } /********* Section for local file protocol **********/ static char *assertLocalUrl(char *url) /* Make sure that url is local and return bits past the protocol. */ { if (startsWith("local:", url)) url += 6; if (url[0] != '/') errAbort("Local urls must start at /"); if (stringIn("..", url) || stringIn("~", url) || stringIn("//", url) || stringIn("/./", url) || endsWith("/.", url)) { errAbort("relative paths not allowed in local urls (%s)", url); } return url; } static int udcDataViaLocal(char *url, bits64 offset, int size, void *buffer, struct connInfo *ci) /* Fetch a block of data of given size into buffer using the http: protocol. * Returns number of bytes actually read. Does an errAbort on * error. Typically will be called with size in the 8k - 64k range. */ { /* Need to check time stamp here. */ verbose(2, "reading remote data - %d bytes at %lld - on %s\n", size, offset, url); url = assertLocalUrl(url); FILE *f = mustOpen(url, "rb"); fseek(f, offset, SEEK_SET); int sizeRead = fread(buffer, 1, size, f); if (ferror(f)) { warn("udcDataViaLocal failed to fetch %d bytes at %lld", size, offset); errnoAbort("file %s", url); } carefulClose(&f); return sizeRead; } static boolean udcInfoViaLocal(char *url, struct udcRemoteFileInfo *retInfo) /* Fill in *retTime with last modified time for file specified in url. * Return FALSE if file does not even exist. */ { verbose(2, "checking remote info on %s\n", url); url = assertLocalUrl(url); struct stat status; int ret = stat(url, &status); if (ret < 0) return FALSE; retInfo->updateTime = status.st_mtime; retInfo->size = status.st_size; return TRUE; } /********* Section for transparent file protocol **********/ static int udcDataViaTransparent(char *url, bits64 offset, int size, void *buffer, struct connInfo *ci) /* Fetch a block of data of given size into buffer using the http: protocol. * Returns number of bytes actually read. Does an errAbort on * error. Typically will be called with size in the 8k - 64k range. */ { internalErr(); /* Should not get here. */ return size; } static boolean udcInfoViaTransparent(char *url, struct udcRemoteFileInfo *retInfo) /* Fill in *retInfo with last modified time for file specified in url. * Return FALSE if file does not even exist. */ { internalErr(); /* Should not get here. */ return FALSE; } /********* Section for slow local file protocol - simulates network... **********/ static int udcDataViaSlow(char *url, bits64 offset, int size, void *buffer, struct connInfo *ci) /* Fetch a block of data of given size into buffer using the http: protocol. * Returns number of bytes actually read. Does an errAbort on * error. Typically will be called with size in the 8k - 64k range. */ { verbose(2, "slow reading remote data - %d bytes at %lld - on %s\n", size, offset, url); sleep1000(500); char *fileName = url + 5; /* skip over 'slow:' */ FILE *f = mustOpen(fileName, "rb"); fseek(f, offset, SEEK_SET); char *pt = buffer; int i, step=1024; int sizeRead = 0; for (i=0; i step) readChunk = step; int oneReadSize = fread(pt, 1, readChunk, f); verbose(2, "slowly read %d bytes\n", oneReadSize); if (ferror(f)) { warn("udcDataViaSlow failed to fetch %d bytes at %lld", size, offset); errnoAbort("file %s", fileName); } pt += step; sizeRead += oneReadSize; } carefulClose(&f); return sizeRead; } static boolean udcInfoViaSlow(char *url, struct udcRemoteFileInfo *retInfo) /* Fill in *retTime with last modified time for file specified in url. * Return FALSE if file does not even exist. */ { char *fileName = url + 5; /* skip over 'slow:' */ verbose(2, "slow checking remote info on %s\n", url); sleep1000(500); struct stat status; int ret = stat(fileName, &status); if (ret < 0) return FALSE; retInfo->updateTime = status.st_mtime; retInfo->size = status.st_size; return TRUE; } /********* Section for http protocol **********/ int udcDataViaHttpOrFtp(char *url, bits64 offset, int size, void *buffer, struct connInfo *ci) /* Fetch a block of data of given size into buffer using url's protocol, * which must be http, https or ftp. Returns number of bytes actually read. * Does an errAbort on error. * Typically will be called with size in the 8k-64k range. */ { if (startsWith("http://",url) || startsWith("https://",url) || startsWith("ftp://",url)) verbose(2, "reading http/https/ftp data - %d bytes at %lld - on %s\n", size, offset, url); else errAbort("Invalid protocol in url [%s] in udcDataViaFtp, only http, https, or ftp supported", url); int sd = connInfoGetSocket(ci, url, offset, size); if (sd < 0) errAbort("Can't get data socket for %s", url); int rd = 0, total = 0, remaining = size; char *buf = (char *)buffer; while ((remaining > 0) && ((rd = read(sd, buf, remaining)) > 0)) { total += rd; buf += rd; remaining -= rd; } if (rd == -1) errnoAbort("udcDataViaHttpOrFtp: error reading socket"); if (ci == NULL) mustCloseFd(&sd); else ci->offset += total; return total; } boolean udcInfoViaHttp(char *url, struct udcRemoteFileInfo *retInfo) /* Gets size and last modified time of URL * and returns status of HEAD GET. */ { verbose(2, "checking http remote info on %s\n", url); struct hash *hash = newHash(0); int status = netUrlHead(url, hash); if (status != 200) // && status != 302 && status != 301) return FALSE; char *sizeString = hashFindValUpperCase(hash, "Content-Length:"); if (sizeString == NULL) { /* try to get remote file size by an alternate method */ retInfo->size = netUrlSizeByRangeResponse(url); if (retInfo->size < 0) { hashFree(&hash); errAbort("No Content-Length: returned in header for %s, can't proceed, sorry", url); } } else { retInfo->size = atoll(sizeString); } char *lastModString = hashFindValUpperCase(hash, "Last-Modified:"); if (lastModString == NULL) { // Date is a poor substitute! It will always appear that the cache is stale. // But at least we can read files from dropbox.com. lastModString = hashFindValUpperCase(hash, "Date:"); if (lastModString == NULL) { hashFree(&hash); errAbort("No Last-Modified: or Date: returned in header for %s, can't proceed, sorry", url); } } struct tm tm; time_t t; // Last-Modified: Wed, 15 Nov 1995 04:58:08 GMT // This will always be GMT if (strptime(lastModString, "%a, %d %b %Y %H:%M:%S %Z", &tm) == NULL) { /* Handle error */; hashFree(&hash); errAbort("unable to parse last-modified string [%s]", lastModString); } t = mktimeFromUtc(&tm); if (t == -1) { /* Handle error */; hashFree(&hash); errAbort("mktimeFromUtc failed while converting last-modified string [%s] from UTC time", lastModString); } retInfo->updateTime = t; hashFree(&hash); return status; } /********* Section for ftp protocol **********/ // fetchData method: See udcDataViaHttpOrFtp above. boolean udcInfoViaFtp(char *url, struct udcRemoteFileInfo *retInfo) /* Gets size and last modified time of FTP URL */ { verbose(2, "checking ftp remote info on %s\n", url); long long size = 0; time_t t, tUtc; struct tm *tm = NULL; // TODO: would be nice to add int *retCtrlSocket to netGetFtpInfo so we can stash // in retInfo->connInfo and keep socket open. boolean ok = netGetFtpInfo(url, &size, &tUtc); if (!ok) return FALSE; // Convert UTC to localtime tm = localtime(&tUtc); t = mktimeFromUtc(tm); if (t == -1) { /* Handle error */; errAbort("mktimeFromUtc failed while converting FTP UTC last-modified time %ld to local time", (long) tUtc); } retInfo->size = size; retInfo->updateTime = t; return TRUE; } /********* Non-protocol-specific bits **********/ boolean udcFastReadString(struct udcFile *f, char buf[256]) /* Read a string into buffer, which must be long enough * to hold it. String is in 'writeString' format. */ { UBYTE bLen; int len; if (!udcReadOne(f, bLen)) return FALSE; if ((len = bLen)> 0) udcMustRead(f, buf, len); buf[len] = 0; return TRUE; } void msbFirstWriteBits64(FILE *f, bits64 x); static char *fileNameInCacheDir(struct udcFile *file, char *fileName) /* Return the name of a file in the cache dir, from the cache root directory on down. * Do a freeMem on this when done. */ { int dirLen = strlen(file->cacheDir); int nameLen = strlen(fileName); char *path = needMem(dirLen + nameLen + 2); memcpy(path, file->cacheDir, dirLen); path[dirLen] = '/'; memcpy(path+dirLen+1, fileName, nameLen); return path; } static void udcNewCreateBitmapAndSparse(struct udcFile *file, bits64 remoteUpdate, bits64 remoteSize, bits32 version) /* Create a new bitmap file around the given remoteUpdate time. */ { int fd = mustOpenFd(file->bitmapFileName, O_WRONLY | O_CREAT | O_TRUNC); bits32 sig = udcBitmapSig; bits32 blockSize = udcBlockSize; bits64 reserved64 = 0; bits32 reserved32 = 0; int blockCount = (remoteSize + udcBlockSize - 1)/udcBlockSize; int bitmapSize = bitToByteSize(blockCount); /* Write out fixed part of header. */ writeOneFd(fd, sig); writeOneFd(fd, blockSize); writeOneFd(fd, remoteUpdate); writeOneFd(fd, remoteSize); writeOneFd(fd, version); writeOneFd(fd, reserved32); writeOneFd(fd, reserved64); writeOneFd(fd, reserved64); writeOneFd(fd, reserved64); writeOneFd(fd, reserved64); long long offset = mustLseek(fd, 0, SEEK_CUR); if (offset != udcBitmapHeaderSize) errAbort("offset in fd=%d, f=%s is %lld, not expected udcBitmapHeaderSize %d", fd, file->bitmapFileName, offset, udcBitmapHeaderSize); /* Write out initial all-zero bitmap, using sparse-file method: write 0 to final address. */ unsigned char zero = 0; mustLseek(fd, bitmapSize-1, SEEK_CUR); mustWriteFd(fd, &zero, 1); /* Clean up bitmap file and name. */ mustCloseFd(&fd); /* Create an empty data file. */ fd = mustOpenFd(file->sparseFileName, O_WRONLY | O_CREAT | O_TRUNC); mustCloseFd(&fd); } static struct udcBitmap *udcBitmapOpen(char *fileName) /* Open up a bitmap file and read and verify header. Return NULL if file doesn't * exist, abort on error. */ { /* Open file, returning NULL if can't. */ int fd = open(fileName, O_RDWR); if (fd < 0) { if (errno == ENOENT) return NULL; else errnoAbort("Can't open(%s, O_RDWR)", fileName); } /* Get status info from file. */ struct stat status; fstat(fd, &status); /* Read signature and decide if byte-swapping is needed. */ // TODO: maybe buffer the I/O for performance? Don't read past header - // fd offset needs to point to first data block when we return. bits32 magic; boolean isSwapped = FALSE; mustReadOneFd(fd, magic); if (magic != udcBitmapSig) { magic = byteSwap32(magic); isSwapped = TRUE; if (magic != udcBitmapSig) errAbort("%s is not a udcBitmap file", fileName); } /* Allocate bitmap object, fill it in, and return it. */ bits32 reserved32; bits64 reserved64; struct udcBitmap *bits; AllocVar(bits); bits->blockSize = fdReadBits32(fd, isSwapped); bits->remoteUpdate = fdReadBits64(fd, isSwapped); bits->fileSize = fdReadBits64(fd, isSwapped); bits->version = fdReadBits32(fd, isSwapped); reserved32 = fdReadBits32(fd, isSwapped); reserved64 = fdReadBits64(fd, isSwapped); reserved64 = fdReadBits64(fd, isSwapped); reserved64 = fdReadBits64(fd, isSwapped); reserved64 = fdReadBits64(fd, isSwapped); bits->localUpdate = status.st_mtime; bits->localAccess = status.st_atime; bits->isSwapped = isSwapped; bits->fd = fd; return bits; } static void udcBitmapClose(struct udcBitmap **pBits) /* Free up resources associated with udcBitmap. */ { struct udcBitmap *bits = *pBits; if (bits != NULL) { mustCloseFd(&(bits->fd)); freez(pBits); } } static struct udcProtocol *udcProtocolNew(char *upToColon) /* Build up a new protocol around a string such as "http" or "local" */ { struct udcProtocol *prot; AllocVar(prot); if (sameString(upToColon, "local")) { prot->fetchData = udcDataViaLocal; prot->fetchInfo = udcInfoViaLocal; } else if (sameString(upToColon, "slow")) { prot->fetchData = udcDataViaSlow; prot->fetchInfo = udcInfoViaSlow; } else if (sameString(upToColon, "http") || sameString(upToColon, "https")) { prot->fetchData = udcDataViaHttpOrFtp; prot->fetchInfo = udcInfoViaHttp; } else if (sameString(upToColon, "ftp")) { prot->fetchData = udcDataViaHttpOrFtp; prot->fetchInfo = udcInfoViaFtp; } else if (sameString(upToColon, "transparent")) { prot->fetchData = udcDataViaTransparent; prot->fetchInfo = udcInfoViaTransparent; } else { errAbort("Unrecognized protocol %s in udcProtNew", upToColon); } return prot; } static void udcProtocolFree(struct udcProtocol **pProt) /* Free up protocol resources. */ { freez(pProt); } static void setInitialCachedDataBounds(struct udcFile *file, boolean useCacheInfo) /* Open up bitmap file and read a little bit of it to see if cache is stale, * and if not to see if the initial part is cached. Sets the data members * startData, and endData. If the case is stale it makes fresh empty * cacheDir/sparseData and cacheDir/bitmap files. */ { bits32 version = 0; /* Get existing bitmap, and if it's stale clean up. */ struct udcBitmap *bits = udcBitmapOpen(file->bitmapFileName); if (bits != NULL) { if (useCacheInfo) { file->size = bits->fileSize; file->updateTime = bits->remoteUpdate; } version = bits->version; if (bits->remoteUpdate != file->updateTime || bits->fileSize != file->size || !fileExists(file->sparseFileName)) { verbose(2, "removing stale version (%lld! = %lld or %lld! = %lld or %s doesn't exist), " "new version %d\n", bits->remoteUpdate, (long long)file->updateTime, bits->fileSize, file->size, file->sparseFileName, version); udcBitmapClose(&bits); remove(file->bitmapFileName); remove(file->sparseFileName); ++version; } } else verbose(2, "bitmap file %s does not already exist, creating.\n", file->bitmapFileName); /* If no bitmap, then create one, and also an empty sparse data file. */ if (bits == NULL) { udcNewCreateBitmapAndSparse(file, file->updateTime, file->size, version); bits = udcBitmapOpen(file->bitmapFileName); if (bits == NULL) errAbort("Unable to open bitmap file %s", file->bitmapFileName); } file->bitmapVersion = bits->version; /* Read in a little bit from bitmap while we have it open to see if we have anything cached. */ if (file->size > 0) { Bits b; off_t wasAt = lseek(bits->fd, 0, SEEK_CUR); mustReadOneFd(bits->fd, b); int endBlock = (file->size + udcBlockSize - 1)/udcBlockSize; if (endBlock > 8) endBlock = 8; int initialCachedBlocks = bitFindClear(&b, 0, endBlock); file->endData = initialCachedBlocks * udcBlockSize; mustLseek(bits->fd, wasAt, SEEK_SET); } file->bits = bits; } static boolean qEscaped(char c) /* Returns TRUE if character needs to be escaped in q-encoding. */ { if (isalnum(c)) return c == 'Q'; else return c != '_' && c != '-' && c != '/' && c != '.'; } static char *qEncode(char *input) /* Do a simple encoding to convert input string into "normal" characters. * Abnormal letters, and '!' get converted into Q followed by two hexadecimal digits. */ { /* First go through and figure out encoded size. */ int size = 0; char *s, *d, c; s = input; while ((c = *s++) != 0) { if (qEscaped(c)) size += 3; else size += 1; } /* Allocate and fill in output. */ char *output = needMem(size+1); s = input; d = output; while ((c = *s++) != 0) { if (qEscaped(c)) { sprintf(d, "Q%02X", (unsigned)c); d += 3; } else *d++ = c; } return output; } void udcParseUrlFull(char *url, char **retProtocol, char **retAfterProtocol, char **retColon, char **retAuth) /* Parse the URL into components that udc treats separately. * *retAfterProtocol is Q-encoded to keep special chars out of filenames. * Free all *ret's except *retColon when done. */ { char *protocol, *afterProtocol; char *colon = strchr(url, ':'); if (!colon) { *retColon = NULL; return; } int colonPos = colon - url; protocol = cloneStringZ(url, colonPos); afterProtocol = url + colonPos + 1; while (afterProtocol[0] == '/') afterProtocol += 1; char *userPwd = strchr(afterProtocol, '@'); if (userPwd) { if (retAuth) { char auth[1024]; safencpy(auth, sizeof(auth), afterProtocol, userPwd+1-afterProtocol); *retAuth = qEncode(auth); } char *afterHost = strchr(afterProtocol, '/'); if (!afterHost) { afterHost = afterProtocol+strlen(afterProtocol); } if (userPwd < afterHost) afterProtocol = userPwd + 1; } else if (retAuth) *retAuth = NULL; afterProtocol = qEncode(afterProtocol); *retProtocol = protocol; *retAfterProtocol = afterProtocol; *retColon = colon; } void udcParseUrl(char *url, char **retProtocol, char **retAfterProtocol, char **retColon) /* Parse the URL into components that udc treats separately. * *retAfterProtocol is Q-encoded to keep special chars out of filenames. * Free *retProtocol and *retAfterProtocol but not *retColon when done. */ { udcParseUrlFull(url, retProtocol, retAfterProtocol, retColon, NULL); } void udcPathAndFileNames(struct udcFile *file, char *cacheDir, char *protocol, char *afterProtocol) /* Initialize udcFile path and names */ { int len = strlen(cacheDir) + 1 + strlen(protocol) + 1 + strlen(afterProtocol) + 1; file->cacheDir = needMem(len); safef(file->cacheDir, len, "%s/%s/%s", cacheDir, protocol, afterProtocol); /* Create file names for bitmap and data portions. */ file->bitmapFileName = fileNameInCacheDir(file, bitmapName); file->sparseFileName = fileNameInCacheDir(file, sparseDataName); } static long long int udcSizeAndModTimeFromBitmap(char *bitmapFileName, time_t *retTime) /* Look up the file size from the local cache bitmap file, or -1 if there * is no cache for url. If retTime is non-null, store the remote update time in it. */ { long long int ret = -1; struct udcBitmap *bits = udcBitmapOpen(bitmapFileName); if (bits != NULL) { ret = bits->fileSize; if (retTime) *retTime = bits->remoteUpdate; } else warn("Can't open bitmap file %s: %s\n", bitmapFileName, strerror(errno)); udcBitmapClose(&bits); return ret; } struct udcFile *udcFileMayOpen(char *url, char *cacheDir) /* Open up a cached file. cacheDir may be null in which case udcDefaultDir() will be * used. Return NULL if file doesn't exist. */ { if (cacheDir == NULL) cacheDir = udcDefaultDir(); verbose(2, "udcfileOpen(%s, %s)\n", url, cacheDir); /* Parse out protocol. Make it "transparent" if none specified. */ char *protocol = NULL, *afterProtocol = NULL, *colon; boolean isTransparent = FALSE; udcParseUrl(url, &protocol, &afterProtocol, &colon); if (!colon) { freeMem(protocol); protocol = cloneString("transparent"); freeMem(afterProtocol); afterProtocol = cloneString(url); isTransparent = TRUE; } struct udcProtocol *prot; prot = udcProtocolNew(protocol); /* Figure out if anything exists. */ boolean useCacheInfo = FALSE; struct udcRemoteFileInfo info; ZeroVar(&info); if (!isTransparent) { useCacheInfo = (udcCacheAge(url, cacheDir) < udcCacheTimeout()); if (!useCacheInfo) { if (!prot->fetchInfo(url, &info)) { udcProtocolFree(&prot); freeMem(protocol); freeMem(afterProtocol); return NULL; } } } /* Allocate file object and start filling it in. */ struct udcFile *file; AllocVar(file); file->url = cloneString(url); file->protocol = protocol; file->prot = prot; if (isTransparent) { /* If transparent dummy up things so that the "sparse" file pointer is actually * the file itself, which appears to be completely loaded in cache. */ int fd = file->fdSparse = mustOpenFd(url, O_RDONLY); struct stat status; fstat(fd, &status); file->startData = 0; file->endData = file->size = status.st_size; } else { udcPathAndFileNames(file, cacheDir, protocol, afterProtocol); if (!useCacheInfo) { file->updateTime = info.updateTime; file->size = info.size; memcpy(&(file->connInfo), &(info.ci), sizeof(struct connInfo)); // update cache file mod times, so if we're caching we won't do this again // until the timeout has expired again: if (udcCacheTimeout() > 0 && fileExists(file->bitmapFileName)) (void)maybeTouchFile(file->bitmapFileName); } /* Make directory. */ makeDirsOnPath(file->cacheDir); /* Figure out a little bit about the extent of the good cached data if any. Open bits bitmap. */ setInitialCachedDataBounds(file, useCacheInfo); file->fdSparse = mustOpenFd(file->sparseFileName, O_RDWR); } freeMem(afterProtocol); return file; } struct udcFile *udcFileOpen(char *url, char *cacheDir) /* Open up a cached file. cacheDir may be null in which case udcDefaultDir() will be * used. Abort if if file doesn't exist. */ { struct udcFile *udcFile = udcFileMayOpen(url, cacheDir); if (udcFile == NULL) errAbort("Couldn't open %s", url); return udcFile; } struct slName *udcFileCacheFiles(char *url, char *cacheDir) /* Return low-level list of files used in cache. */ { char *protocol, *afterProtocol, *colon; struct udcFile *file; udcParseUrl(url, &protocol, &afterProtocol, &colon); if (colon == NULL) return NULL; AllocVar(file); udcPathAndFileNames(file, cacheDir, protocol, afterProtocol); struct slName *list = NULL; slAddHead(&list, slNameNew(file->bitmapFileName)); slAddHead(&list, slNameNew(file->sparseFileName)); slReverse(&list); freeMem(file->cacheDir); freeMem(file->bitmapFileName); freeMem(file->sparseFileName); freeMem(file); freeMem(protocol); freeMem(afterProtocol); return list; } void udcFileClose(struct udcFile **pFile) /* Close down cached file. */ { struct udcFile *file = *pFile; if (file != NULL) { if (file->connInfo.socket != 0) mustCloseFd(&(file->connInfo.socket)); if (file->connInfo.ctrlSocket != 0) mustCloseFd(&(file->connInfo.ctrlSocket)); freeMem(file->url); freeMem(file->protocol); udcProtocolFree(&file->prot); freeMem(file->cacheDir); freeMem(file->bitmapFileName); freeMem(file->sparseFileName); freeMem(file->sparseReadAheadBuf); mustCloseFd(&(file->fdSparse)); udcBitmapClose(&file->bits); } freez(pFile); } static void qDecode(const char *input, char *buf, size_t size) /* Reverse the qEncode performed on afterProcotol above into buf or abort. */ { safecpy(buf, size, input); char c, *r = buf, *w = buf; while ((c = *r++) != '\0') { if (c == 'Q') { int q; if (sscanf(r, "%02X", &q)) { *w++ = (char)q; r += 2; } else errAbort("qDecode: input \"%s\" does not appear to be properly formatted " "starting at \"%s\"", input, r); } else *w++ = c; } *w = '\0'; } char *udcPathToUrl(const char *path, char *buf, size_t size, char *cacheDir) /* Translate path into an URL, store in buf, return pointer to buf if successful * and NULL if not. */ { if (cacheDir == NULL) cacheDir = udcDefaultDir(); int offset = 0; if (startsWith(cacheDir, (char *)path)) offset = strlen(cacheDir); if (path[offset] == '/') offset++; char protocol[16]; strncpy(protocol, path+offset, sizeof(protocol)); protocol[ArraySize(protocol)-1] = '\0'; char *p = strchr(protocol, '/'); if (p == NULL) { errAbort("unable to parse protocol (first non-'%s' directory) out of path '%s'\n", cacheDir, path); return NULL; } *p++ = '\0'; char afterProtocol[4096]; qDecode(path+1+strlen(protocol)+1, afterProtocol, sizeof(afterProtocol)); safef(buf, size, "%s://%s", protocol, afterProtocol); return buf; } long long int udcSizeFromCache(char *url, char *cacheDir) /* Look up the file size from the local cache bitmap file, or -1 if there * is no cache for url. */ { long long int ret = -1; if (cacheDir == NULL) cacheDir = udcDefaultDir(); struct slName *sl, *slList = udcFileCacheFiles(url, cacheDir); for (sl = slList; sl != NULL; sl = sl->next) if (endsWith(sl->name, bitmapName)) { ret = udcSizeAndModTimeFromBitmap(sl->name, NULL); break; } slNameFreeList(&slList); return ret; } unsigned long udcCacheAge(char *url, char *cacheDir) /* Return the age in seconds of the oldest cache file. If a cache file is * missing, return the current time (seconds since the epoch). */ { unsigned long now = clock1(), oldestTime = now; if (cacheDir == NULL) cacheDir = udcDefaultDir(); struct slName *sl, *slList = udcFileCacheFiles(url, cacheDir); if (slList == NULL) return now; for (sl = slList; sl != NULL; sl = sl->next) if (endsWith(sl->name, bitmapName)) { if (fileExists(sl->name)) oldestTime = min(fileModTime(sl->name), oldestTime); else return now; } return (now - oldestTime); } static void readBitsIntoBuf(int fd, int headerSize, int bitStart, int bitEnd, Bits **retBits, int *retPartOffset) /* Do some bit-to-byte offset conversions and read in all the bytes that * have information in the bits we're interested in. */ { int byteStart = bitStart/8; int byteEnd = bitToByteSize(bitEnd); int byteSize = byteEnd - byteStart; Bits *bits = needLargeMem(byteSize); mustLseek(fd, headerSize + byteStart, SEEK_SET); mustReadFd(fd, bits, byteSize); *retBits = bits; *retPartOffset = byteStart*8; } static boolean allBitsSetInFile(int bitStart, int bitEnd, int partOffset, Bits *bits) /* Return TRUE if all bits in file between start and end are set. */ { int partBitStart = bitStart - partOffset; int partBitEnd = bitEnd - partOffset; int nextClearBit = bitFindClear(bits, partBitStart, partBitEnd); boolean allSet = (nextClearBit >= partBitEnd); return allSet; } // For tests/udcTest.c debugging: not declared in udc.h, but not static either: boolean udcCheckCacheBits(struct udcFile *file, int startBlock, int endBlock) /* Warn and return TRUE if any bit in (startBlock,endBlock] is not set. */ { boolean gotUnset = FALSE; struct udcBitmap *bitmap = udcBitmapOpen(file->bitmapFileName); int partOffset; Bits *bits; readBitsIntoBuf(bitmap->fd, udcBitmapHeaderSize, startBlock, endBlock, &bits, &partOffset); int partBitStart = startBlock - partOffset; int partBitEnd = endBlock - partOffset; int nextClearBit = bitFindClear(bits, partBitStart, partBitEnd); while (nextClearBit < partBitEnd) { int clearBlock = nextClearBit + partOffset; warn("... udcFile 0x%04lx: bit for block %d (%lld..%lld] is not set", (unsigned long)file, clearBlock, ((long long)clearBlock * udcBlockSize), (((long long)clearBlock+1) * udcBlockSize)); gotUnset = TRUE; int nextSetBit = bitFindSet(bits, nextClearBit, partBitEnd); nextClearBit = bitFindClear(bits, nextSetBit, partBitEnd); } return gotUnset; } static void fetchMissingBlocks(struct udcFile *file, struct udcBitmap *bits, int startBlock, int blockCount, int blockSize) /* Fetch missing blocks from remote and put them into file. errAbort if trouble. */ { bits64 startPos = (bits64)startBlock * blockSize; bits64 endPos = startPos + (bits64)blockCount * blockSize; if (endPos > file->size) endPos = file->size; if (endPos > startPos) { bits64 readSize = endPos - startPos; void *buf = needLargeMem(readSize); int actualSize = file->prot->fetchData(file->url, startPos, readSize, buf, &(file->connInfo)); if (actualSize != readSize) errAbort("unable to fetch %lld bytes from %s @%lld (got %d bytes)", readSize, file->url, startPos, actualSize); mustLseek(file->fdSparse, startPos, SEEK_SET); mustWriteFd(file->fdSparse, buf, readSize); freez(&buf); } } static boolean fetchMissingBits(struct udcFile *file, struct udcBitmap *bits, bits64 start, bits64 end, bits64 *retFetchedStart, bits64 *retFetchedEnd) /* Scan through relevant parts of bitmap, fetching blocks we don't already have. */ { /* Fetch relevant part of bitmap into memory */ int partOffset; Bits *b; int startBlock = start / bits->blockSize; int endBlock = (end + bits->blockSize - 1) / bits->blockSize; readBitsIntoBuf(bits->fd, udcBitmapHeaderSize, startBlock, endBlock, &b, &partOffset); if (allBitsSetInFile(startBlock, endBlock, partOffset, b)) { // it is already in the cache freeMem(b); return TRUE; } /* Loop around first skipping set bits, then fetching clear bits. */ boolean dirty = FALSE; int s = startBlock - partOffset; int e = endBlock - partOffset; for (;;) { int nextClearBit = bitFindClear(b, s, e); if (nextClearBit >= e) break; int nextSetBit = bitFindSet(b, nextClearBit, e); int clearSize = nextSetBit - nextClearBit; fetchMissingBlocks(file, bits, nextClearBit + partOffset, clearSize, bits->blockSize); bitSetRange(b, nextClearBit, clearSize); dirty = TRUE; if (nextSetBit >= e) break; s = nextSetBit; } if (dirty) { /* Update bitmap on disk.... */ int byteStart = startBlock/8; int byteEnd = bitToByteSize(endBlock); int byteSize = byteEnd - byteStart; mustLseek(bits->fd, byteStart + udcBitmapHeaderSize, SEEK_SET); mustWriteFd(bits->fd, b, byteSize); } freeMem(b); *retFetchedStart = startBlock * bits->blockSize; *retFetchedEnd = endBlock * bits->blockSize; return FALSE; } static boolean rangeIntersectOrTouch64(bits64 start1, bits64 end1, bits64 start2, bits64 end2) /* Return true if two 64-bit ranges intersect or touch. */ { // cannot use the version of this function that is in common.c since it only handles integers. bits64 s = max(start1,start2); bits64 e = min(end1,end2); return e >= s; } static void udcFetchMissing(struct udcFile *file, struct udcBitmap *bits, bits64 start, bits64 end) /* Fetch missing pieces of data from file */ { /* Call lower level routine fetch remote data that is not already here. */ bits64 fetchedStart, fetchedEnd; if (fetchMissingBits(file, bits, start, end, &fetchedStart, &fetchedEnd)) return; /* Update file startData/endData members to include new data (and old as well if * the new data overlaps the old). */ if (rangeIntersectOrTouch64(file->startData, file->endData, fetchedStart, fetchedEnd)) { if (fetchedStart > file->startData) fetchedStart = file->startData; if (fetchedEnd < file->endData) fetchedEnd = file->endData; } file->startData = fetchedStart; file->endData = fetchedEnd; } static boolean udcCachePreload(struct udcFile *file, bits64 offset, bits64 size) /* Make sure that given data is in cache - fetching it remotely if need be. * Return TRUE on success. */ { boolean ok = TRUE; /* We'll break this operation into blocks of a reasonable size to allow * other processes to get cache access, since we have to lock the cache files. */ bits64 s,e, endPos=offset+size; for (s = offset; s < endPos; s = e) { /* Figure out bounds of this section. */ e = s + udcMaxBytesPerRemoteFetch; if (e > endPos) e = endPos; struct udcBitmap *bits = file->bits; if (bits->version == file->bitmapVersion) { udcFetchMissing(file, bits, s, e); } else { ok = FALSE; verbose(2, "udcCachePreload version check failed %d vs %d", bits->version, file->bitmapVersion); } if (!ok) break; } return ok; } #define READAHEADBUFSIZE 4096 bits64 udcRead(struct udcFile *file, void *buf, bits64 size) /* Read a block from file. Return amount actually read. */ { /* Figure out region of file we're going to read, and clip it against file size. */ bits64 start = file->offset; if (start > file->size) return 0; bits64 end = start + size; if (end > file->size) end = file->size; size = end - start; char *cbuf = buf; /* use read-ahead buffer if present */ bits64 bytesRead = 0; bits64 raStart; bits64 raEnd; while(TRUE) { if (file->sparseReadAhead) { raStart = file->sparseRAOffset; raEnd = raStart+READAHEADBUFSIZE; if (start >= raStart && start < raEnd) { // copy bytes out of rabuf bits64 endInBuf = min(raEnd, end); bits64 sizeInBuf = endInBuf - start; memcpy(cbuf, file->sparseReadAheadBuf + (start-raStart), sizeInBuf); cbuf += sizeInBuf; bytesRead += sizeInBuf; start = raEnd; size -= sizeInBuf; file->offset += sizeInBuf; if (size == 0) break; } file->sparseReadAhead = FALSE; mustLseek(file->fdSparse, start, SEEK_SET); } bits64 saveEnd = end; if (size < READAHEADBUFSIZE) { file->sparseReadAhead = TRUE; if (!file->sparseReadAheadBuf) file->sparseReadAheadBuf = needMem(READAHEADBUFSIZE); file->sparseRAOffset = start; size = READAHEADBUFSIZE; end = start + size; if (end > file->size) { end = file->size; size = end - start; } } /* If we're outside of the window of file we already know is good, then have to * consult cache on disk, and maybe even fetch data remotely! */ if (start < file->startData || end > file->endData) { if (!udcCachePreload(file, start, size)) { verbose(2, "udcCachePreload failed"); bytesRead = 0; break; } /* Currently only need fseek here. Would be safer, but possibly * slower to move fseek so it is always executed in front of read, in * case other code is moving around file pointer. */ mustLseek(file->fdSparse, start, SEEK_SET); } if (file->sparseReadAhead) { mustReadFd(file->fdSparse, file->sparseReadAheadBuf, size); end = saveEnd; size = end - start; } else { mustReadFd(file->fdSparse, cbuf, size); file->offset += size; bytesRead += size; break; } } return bytesRead; } void udcMustRead(struct udcFile *file, void *buf, bits64 size) /* Read a block from file. Abort if any problem, including EOF before size is read. */ { bits64 sizeRead = udcRead(file, buf, size); if (sizeRead < size) errAbort("udc couldn't read %llu bytes from %s, did read %llu", size, file->url, sizeRead); } int udcGetChar(struct udcFile *file) /* Get next character from file or die trying. */ { UBYTE b; udcMustRead(file, &b, 1); return b; } bits64 udcReadBits64(struct udcFile *file, boolean isSwapped) /* Read and optionally byte-swap 64 bit entity. */ { bits64 val; udcMustRead(file, &val, sizeof(val)); if (isSwapped) val = byteSwap64(val); return val; } bits32 udcReadBits32(struct udcFile *file, boolean isSwapped) /* Read and optionally byte-swap 32 bit entity. */ { bits32 val; udcMustRead(file, &val, sizeof(val)); if (isSwapped) val = byteSwap32(val); return val; } bits16 udcReadBits16(struct udcFile *file, boolean isSwapped) /* Read and optionally byte-swap 16 bit entity. */ { bits16 val; udcMustRead(file, &val, sizeof(val)); if (isSwapped) val = byteSwap16(val); return val; } float udcReadFloat(struct udcFile *file, boolean isSwapped) /* Read and optionally byte-swap floating point number. */ { float val; udcMustRead(file, &val, sizeof(val)); if (isSwapped) val = byteSwapFloat(val); return val; } double udcReadDouble(struct udcFile *file, boolean isSwapped) /* Read and optionally byte-swap double-precision floating point number. */ { double val; udcMustRead(file, &val, sizeof(val)); if (isSwapped) val = byteSwapDouble(val); return val; } char *udcReadStringAndZero(struct udcFile *file) /* Read in zero terminated string from file. Do a freeMem of result when done. */ { char shortBuf[2], *longBuf = NULL, *buf = shortBuf; int i, bufSize = sizeof(shortBuf); for (i=0; ; ++i) { /* See if need to expand buffer, which is initially on stack, but if it gets big goes into * heap. */ if (i >= bufSize) { int newBufSize = bufSize*2; char *newBuf = needLargeMem(newBufSize); memcpy(newBuf, buf, bufSize); freeMem(longBuf); buf = longBuf = newBuf; bufSize = newBufSize; } char c = udcGetChar(file); buf[i] = c; if (c == 0) break; } char *retString = cloneString(buf); freeMem(longBuf); return retString; } char *udcFileReadAll(char *url, char *cacheDir, size_t maxSize, size_t *retSize) /* Read a complete file via UDC. The cacheDir may be null in which case udcDefaultDir() * will be used. If maxSize is non-zero, check size against maxSize * and abort if it's bigger. Returns file data (with an extra terminal for the * common case where it's treated as a C string). If retSize is non-NULL then * returns size of file in *retSize. Do a freeMem or freez of the returned buffer * when done. */ { struct udcFile *file = udcFileOpen(url, cacheDir); size_t size = file->size; if (maxSize != 0 && size > maxSize) errAbort("%s is %lld bytes, but maxSize to udcFileReadAll is %lld", url, (long long)size, (long long)maxSize); char *buf = needLargeMem(size+1); udcMustRead(file, buf, size); buf[size] = 0; // add trailing zero for string processing udcFileClose(&file); if (retSize != NULL) *retSize = size; return buf; } struct lineFile *udcWrapShortLineFile(char *url, char *cacheDir, size_t maxSize) /* Read in entire short (up to maxSize) url into memory and wrap a line file around it. * The cacheDir may be null in which case udcDefaultDir() will be used. If maxSize * is zero then a default value (currently 64 meg) will be used. */ { if (maxSize == 0) maxSize = 64 * 1024 * 1024; char *buf = udcFileReadAll(url, cacheDir, maxSize, NULL); return lineFileOnString(url, TRUE, buf); } void udcSeekCur(struct udcFile *file, bits64 offset) /* Seek to a particular position in file. */ { file->offset += offset; mustLseek(file->fdSparse, offset, SEEK_CUR); } void udcSeek(struct udcFile *file, bits64 offset) /* Seek to a particular position in file. */ { file->offset = offset; mustLseek(file->fdSparse, offset, SEEK_SET); } bits64 udcTell(struct udcFile *file) /* Return current file position. */ { return file->offset; } static long bitRealDataSize(char *fileName) /* Return number of real bytes indicated by bitmaps */ { struct udcBitmap *bits = udcBitmapOpen(fileName); int blockSize = bits->blockSize; long byteSize = 0; int blockCount = (bits->fileSize + blockSize - 1)/blockSize; if (blockCount > 0) { int bitmapSize = bitToByteSize(blockCount); Bits *b = needLargeMem(bitmapSize); mustReadFd(bits->fd, b, bitmapSize); int bitsSet = bitCountRange(b, 0, blockCount); byteSize = (long)bitsSet*blockSize; freez(&b); } udcBitmapClose(&bits); return byteSize; } static bits64 rCleanup(time_t deleteTime, boolean testOnly) /* Delete any bitmap or sparseData files last accessed before deleteTime */ { struct fileInfo *file, *fileList = listDirX(".", "*", FALSE); bits64 results = 0; for (file = fileList; file != NULL; file = file->next) { if (file->isDir) { setCurrentDir(file->name); bits64 oneResult = rCleanup(deleteTime, testOnly); setCurrentDir(".."); if (oneResult > 0) { if (!testOnly) remove(file->name); results += oneResult; results += file->size; } } else if (sameString(file->name, bitmapName)) { if (file->size > udcBitmapHeaderSize) /* prevent failure on bitmap files of size 0 or less than header size */ verbose(2, "%ld (%ld) %s/%s\n", bitRealDataSize(file->name), (long)file->size, getCurrentDir(), file->name); if (file->lastAccess < deleteTime) { /* Remove all files when get bitmap, so that can ensure they are deleted in * right order. */ results += file->size; if (!testOnly) { remove(bitmapName); remove(sparseDataName); } } } else if (sameString(file->name, sparseDataName)) { if (results > 0) results += file->size; } } return results; } bits64 udcCleanup(char *cacheDir, double maxDays, boolean testOnly) /* Remove cached files older than maxDays old. If testOnly is set * no clean up is done, but the size of the files that would be * cleaned up is still. */ { time_t maxSeconds = maxDays * 24 * 60 * 60; char *curPath = cloneString(getCurrentDir()); setCurrentDir(cacheDir); time_t deleteTime = time(NULL) - maxSeconds; bits64 result = rCleanup(deleteTime, testOnly); setCurrentDir(curPath); return result; } static char *defaultDir = "/tmp/udcCache"; char *udcDefaultDir() /* Get default directory for cache */ { return defaultDir; } void udcSetDefaultDir(char *path) /* Set default directory for cache */ { defaultDir = path; } int udcCacheTimeout() /* Get cache timeout (if local cache files are newer than this many seconds, * we won't ping the remote server to check the file size and update time). */ { return cacheTimeout; } void udcSetCacheTimeout(int timeout) /* Set cache timeout (if local cache files are newer than this many seconds, * we won't ping the remote server to check the file size and update time). */ { cacheTimeout = timeout; } time_t udcUpdateTime(struct udcFile *udc) /* return udc->updateTime */ { if (sameString("transparent", udc->protocol)) { struct stat status; int ret = stat(udc->url, &status); if (ret < 0) return 0; else return status.st_mtime; } return udc->updateTime; } #ifdef PROGRESS_METER off_t remoteFileSize(char *url) /* fetch remote file size from given URL */ { off_t answer = 0; struct udcRemoteFileInfo info; if (startsWith("http://",url) || startsWith("https://",url)) { if (udcInfoViaHttp(url, &info)) answer = info.size; } else if (startsWith("ftp://",url)) { if (udcInfoViaFtp(url, &info)) answer = info.size; } return answer; } #endif