# File src/library/tools/R/pdftools.R
# Part of the R package, http://www.R-project.org
#
# Copyright (C) 1995-2013 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# http://www.r-project.org/Licenses/
## See PDF Reference version 1.7 chapter 3:
## At the most fundamental level, a PDF file is a sequence of 8-bit
## bytes.
## where subsequently for chapter 3
## the term character is synonymous with byte and merely refers to a
## particular 8-bit value ...
##
## The PDF character set is divided into 3 classes:
## White-space characters (table 3.1):
## 0 00 000 Null (NUL)
## 9 09 011 Tab (HT)
## 10 0A 012 Line feed (LF)
## 12 0C 014 Form feed (FF)
## 13 0D 015 Carriage return (CR)
## 32 20 040 Space (SP)
## Delimiter characters:
## ( ) < > [ ] { } / %
## All characters except white-space characters and delimiters are
## regular characters.
## * Variables
pdf_bytes_whitespaces <-
c(charToRaw("\t\n\f\r "), as.raw(0L))
pdf_bytes_delimiters <-
charToRaw("()<>[]{}/%")
pdf_bytes_non_regulars <-
c(pdf_bytes_whitespaces, pdf_bytes_delimiters)
pdf_bytes_eols <-
charToRaw("\n\r")
pdf_bytes_digits <-
charToRaw("0123456789")
pdf_bytes_in_keywords <-
charToRaw(paste0("*'\"",
"0123456789",
paste(LETTERS, collapse = ""),
paste(letters, collapse = "")))
pdf_bytes_in_numerics_not_digits <-
charToRaw("+-.")
pdf_bytes_in_numerics <-
c(pdf_bytes_digits, pdf_bytes_in_numerics_not_digits)
pdf_page_sizes <-
do.call(rbind,
list("A0" = c(2384L, 3371L),
"A1" = c(1685L, 2384L),
"A2" = c(1190L, 1684L),
"A3" = c( 842L, 1190L),
"A4" = c( 595L, 842L),
"A5" = c( 420L, 595L),
"B4" = c( 729L, 1032L),
"B5" = c( 516L, 729L),
"letter" = c( 612L, 792L),
"tabloid" = c( 792L, 1224L),
"ledger" = c(1224L, 792L),
"legal" = c( 612L, 1008L),
"statement" = c( 396L, 612L),
"executive" = c( 540L, 720L),
"folio" = c( 612L, 936L),
"quarto" = c( 610L, 780L),
"10x14" = c( 720L, 1008L)))
## * pdf_doc
pdf_doc <-
function(file, cache = TRUE)
{
if(is.character(file)) {
##
## When caching, we could read the whole PDF file into memory
## and use a raw connection to read this byte stream. But is
## there a way to get the connection closed when the doc object
## gets removed?
## if(cache) {
## .bytes <- readBin(file, "raw", file.info(file)$size)
## con <- rawConnection(.bytes)
## keep <- TRUE
## }
##
con <- file(file, "rb")
on.exit(close(con))
keep <- FALSE
} else if(inherits(file, "rawConnection")) {
con <- file
file <- NA_character_
keep <- TRUE
} else if(inherits(file, "file")) {
con <- file
file <- summary(con)$description
keep <- TRUE
} else {
stop(gettextf("%s must be a character string or a file/raw connection",
sQuote("file")),
domain = NA)
}
## Read header.
.con_seek(con, 0L)
header <- rawToChar(read_next_bytes_until_whitespace(con))
if(substring(header, 1L, 5L) != "%PDF-")
stop("PDF header not found")
## Go to the end.
.con_seek(con, -1L, 3L)
## Record file size as number of bytes.
nbytes <- .con_seek(con) + 1L
## Check footer.
bytes <- raw()
while(!length(bytes))
bytes <- read_prev_bytes_after_eols(con)
if(rawToChar(bytes) != "%%EOF")
stop("EOF marker not found")
## Find startxref entry (the location of the xref table).
## See PDF Reference version 1.7 section 3.4.4:
## The last line of the file contains only the end-of-file marker,
## %%EOF.
## The two preceding lines contain the keyword startxref and the
## byte offset from the beginning of the file to the beginning of
## the xref keyword in the last cross-reference section.
## But as of 2011-09-12 there is at least one PDF in CRAN's packages
## (limSolve/inst/doc/JSS-373-fig1.pdf) which has keyword and offset
## in the same line.
## Hence, let's be nice, and read back over any white-space and not
## just eols.
bytes <- read_prev_bytes_after_bytes(con, pdf_bytes_whitespaces)
startxref <- suppressWarnings(as.integer(rawToChar(bytes)))
bytes <- read_prev_bytes_after_bytes(con, pdf_bytes_whitespaces)
if(substring(rawToChar(bytes), 1L, 9L) != "startxref")
stop("cannot find 'startxref' keyword")
xref_tabs <-
matrix(integer(), nrow = 0L, ncol = 4L,
dimnames = list(NULL, c("num", "pos", "gen", "use")))
xref_objs <-
matrix(integer(), nrow = 0L, ncol = 3L,
dimnames = list(NULL, c("num", "str", "idx")))
trailer <- list()
## Some PDFs have the offset to the xref table wrong. As of
## 2011-08-24,
## * gdata/inst/doc/gregmisc.pdf points to the char after the x
## * gplots/inst/doc/BalloonPlot.pdf points to after xref
find_xref_start <- function(con) {
## Skip backwards over whitespace, then read back until the next
## whitespace.
.con_seek(con, -1L, 2L)
repeat {
x <- .con_read_bytes(con, 1L)
if(!(x %.IN.% pdf_bytes_whitespaces)) break
.con_seek(con, -2L, 2L)
}
repeat {
.con_seek(con, -2L, 2L)
x <- .con_read_bytes(con, 1L)
if(x %.IN.% pdf_bytes_whitespaces) break
}
pos <- .con_seek(con)
if(rawToChar(.con_read_bytes(con, 4L)) == "xref")
pos
else
stop("cannot find xref table")
}
## Load the xref info.
repeat {
.con_seek(con, startxref)
x <- .con_read_bytes(con, 1L)
if(x == charToRaw("x")) {
## A standard cross-reference table, hopefully.
bytes <- read_next_bytes_until_whitespace(con)
if(!rawToChar(bytes) == "ref")
stop("cannot read xref table")
read_next_non_whitespace_and_seek_back(con)
repeat {
bytes <- read_next_bytes_until_whitespace(con)
num <- suppressWarnings(as.integer(rawToChar(bytes)))
read_next_non_whitespace_and_seek_back(con)
bytes <- read_next_bytes_until_whitespace(con)
size <- suppressWarnings(as.integer(rawToChar(bytes)))
read_next_non_whitespace_and_seek_back(con)
cnt <- 0
entries <- list()
while(cnt < size) {
bytes <- .con_read_bytes(con, 20L)
## Assume that all lines were correctly at 20 bytes.
## (Could also try to be nice ...)
entry <- c(num,
as.integer(rawToChar(bytes[ 1L : 10L])),
as.integer(rawToChar(bytes[12L : 16L])),
bytes[18L] == 0x6e) # charToRaw("n") => 6e
entries <- c(entries, list(entry))
cnt <- cnt + 1L
num <- num + 1L
}
xref_tabs <- rbind(xref_tabs, do.call(rbind, entries))
read_next_non_whitespace_and_seek_back(con)
if(rawToChar(.con_read_bytes(con, 7L)) != "trailer")
.con_seek(con, -7L, 2L)
else break
}
## Read trailer info.
read_next_non_whitespace_and_seek_back(con)
new_trailer <- pdf_read_object(con)
## Merge with current trailer info.
trailer[names(new_trailer)] <- new_trailer
## If the trailer info has a /Prev key, then redo the above
## with the corresponding value the new startxref.
startxref <- new_trailer[["Prev"]]
if(is.null(startxref)) break
} else if(x %.IN.% pdf_bytes_digits) {
## PDF 1.5+ cross-reference stream, hopefully.
.con_seek(con, -1L, 2L)
pos <- .con_seek(con)
hdr <- pdf_read_object_header(con)
obj <- pdf_read_object(con)
if(!("Type" %in% names(obj)) ||
!(obj[["Type"]] == "XRef")) {
## Something's wrong.
## Try finding the xref table before pos.
.con_seek(con, pos)
startxref <- find_xref_start(con)
next
}
size <- obj[["Size"]]
index <- obj[["Index"]]
index <- if(is.null(index)) {
## Use default [0 Size].
cbind(0, size)
} else {
matrix(unlist(index), ncol = 2L, byrow = TRUE)
}
field_sizes <- unlist(obj[["W"]])
stream <- rawConnection(PDF_Stream_get_data(obj))
for(i in seq_len(nrow(index))) {
num <- index[i, 1L]
cnt <- 0L
while(cnt < index[i, 2L]) {
bytes <- .con_read_bytes(stream, field_sizes[1L])
d1 <- strtoi(paste(bytes, collapse = ""), 16L)
bytes <- .con_read_bytes(stream, field_sizes[2L])
d2 <- strtoi(paste(bytes, collapse = ""), 16L)
bytes <- .con_read_bytes(stream, field_sizes[3L])
d3 <- strtoi(paste(bytes, collapse = ""), 16L)
## Might actually need to overwrite entries.
## Worry about that lateron ...
if(d1 == 1) {
xref_tabs <-
rbind(xref_tabs, c(num, d2, d3, TRUE))
} else if(d1 == 2) {
xref_objs <-
rbind(xref_objs, c(num, d2, d3))
}
cnt <- cnt + 1L
num <- num + 1L
}
}
close(stream)
keys <- c("Root", "Encrypt", "Info", "ID")
pos <- match(keys, names(obj), nomatch = 0L)
trailer[keys[pos > 0L]] <- obj[pos]
startxref <- obj[["Prev"]]
if(is.null(startxref)) break
} else {
.con_seek(con, -1L, 2L)
startxref <- find_xref_start(con)
}
}
## Determine the "active" objects (all objects in cross-reference
## streams and objects in cross-reference tables marked in use and
## with a "real" position).
ind <- (xref_tabs[, "pos"] > 0L) & (xref_tabs[, "use"] > 0L)
names <-
c(.ref_to_name(xref_tabs[ind, c("num", "gen"), drop = FALSE]),
if(length(num <- xref_objs[, "num"]))
.ref_to_name(cbind(num, 0)))
## Build an index of names as lists of object numbers with
## corresponding generation numbers in decreasing order.
nums <- xref_objs[, "num"]
gens <- rep.int(0L, length(nums))
nums <- c(nums, xref_tabs[ind, "num"])
gens <- c(gens, xref_tabs[ind, "gen"])
gens_by_nums <-
lapply(split(as.integer(gens), nums), sort, decreasing = TRUE)
y <- new.env(parent = emptyenv())
y$file <- file
y$size <- nbytes
y$header <- header
y$xref_tabs <- xref_tabs
y$xref_objs <- xref_objs
y$trailer <- trailer
y$cache <- cache
y$con <- if(keep) con else NULL
y$names <- names
y$length <- length(names)
y$gens_by_nums <- gens_by_nums
y$objects <- list()
class(y) <- "pdf_doc"
y
}
print.pdf_doc <-
function(x, ...)
{
writeLines(strwrap(sprintf("PDF document (file \"%s\", %d bytes, %d objects)",
x$file, x$size, x$length),
exdent = 4L))
invisible(x)
}
length.pdf_doc <-
function(x)
x$length
names.pdf_doc <-
function(x)
x$names
`[.pdf_doc` <-
function(x, i)
{
if(!inherits(x, "pdf_doc")) stop("wrong class")
if(missing(i)) return(pdf_doc_get_objects(x))
i <- as.character(i) # For now ...
out <- vector("list", length(i))
pos <- .match_ids_to_pdf_doc_names(i, x)
nms <- x$names[pos]
names(out) <- nms
ind <- !is.na(pos)
out[ind] <- pdf_doc_get_objects(x, nms[ind])
out
}
`[[.pdf_doc` <-
function(x, i)
{
if(!inherits(x, "pdf_doc")) stop("wrong class")
if(missing(i) || (!(len <- length(i <- as.character(i)))))
stop("attempting to select less than one element")
else if(len > 1L)
stop("attempting to select more than one element")
pos <- .match_ids_to_pdf_doc_names(i, x)
if(is.na(pos)) return(NULL)
pdf_doc_get_object(x, x$names[pos])
}
## * pdf_catalog
pdf_catalog <-
function(file)
{
doc <- if(inherits(file, "pdf_doc")) file else pdf_doc(file)
pdf_doc_get_object(doc, doc$trailer[["Root"]])
}
## * pdf_fonts
##
## Currently only extracts the fonts used in pages (but not, e.g.,
## forms).
##
pdf_fonts <-
function(file)
{
doc <- if(inherits(file, "pdf_doc"))
file
else
pdf_doc(file, cache = FALSE)
con <- file(doc$file, "rb")
on.exit(close(con))
## Get the page resources.
resources <- pdf_doc_get_page_resources(doc, con)
## Get the font resources (actually, their references).
frefs <- list()
for(res in resources) {
fonts <- pdf_dereference_maybe(res[["Font"]], doc, con)
for(ref in fonts) {
if(is.na(match(list(ref), frefs))) {
frefs <- c(frefs, list(ref))
}
}
}
if(!length(frefs)) {
tab <- data.frame(name = character(),
type = character(),
emb = logical(),
sub = logical(),
uni = logical(),
enc = logical(),
num = integer(),
gen = integer(),
stringsAsFactors = FALSE)
tab$enc <- list()
return(tab)
}
## Now get the referenced font objects and extract some basic
## information in the style of pdffonts(1).
## emb "yes" if the font is embedded in the PDF file
## sub "yes" if the font is a subset
## uni "yes" if there is an explicit "ToUnicode" map in the PDF file
## (the absence of a ToUnicode map doesn't necessarily mean that
## the text can't be converted to Unicode)
## Re subset, see section 5.5.3 "Font Subsets" in the PDF ref:
## For a font subset, the PostScript name of the font--the value
## of the font's BaseFont entry and the font descriptor's FontName
## entry--begins with a tag followed by a plus sign (+). The tag
## consists of exactly six uppercase letters; the choice of
## letters is arbitrary, but different subsets in the same PDF
## file must have different tags.
## Also extract /Encoding information:
## A specification of the font's character encoding if different
## from its built-in encoding. The value of Encoding is either the
## name of a predefined encoding (MacRomanEncoding,
## MacExpertEncoding, or WinAnsiEncoding, as described in Appendix
## D) or an encoding dictionary that specifies differences from
## the font's built-in encoding or from a specified predefined
## encoding (see Section 5.5.5, "Character Encoding").
tab <- lapply(frefs,
function(ref) {
obj <- pdf_doc_get_object(doc, ref, con)
base <- obj[["BaseFont"]]
## See PDF Reference version 1.7 section 5.5.4.
## Type 3 font dictionaries have no BaseFont entry.
## (Guess they are always embedded?)
if(is.null(base))
base <- "[none]"
else if(inherits(base,
"PDF_Indirect_Reference")) {
base <- pdf_doc_get_object(doc, base, con)
}
list(base,
obj[["Subtype"]],
((base == "[none]") ||
!is.null(obj[["FontDescriptor"]])),
grepl("^[[:upper:]]{6}\\+", base, perl = TRUE),
!is.null(obj[["ToUnicode"]]),
obj[["Encoding"]],
ref["num"],
ref["gen"])
})
tab <- as.data.frame(do.call(rbind, tab))
names(tab) <-
c("name", "type", "emb", "sub", "uni", "enc", "num", "gen")
##
## This turns name and type from lists of PDF names to character.
## Maybe add methods for combining PDF_Name objects lateron ...
ind <- names(tab) != "enc"
tab[ind] <- lapply(tab[ind], unlist)
##
##
## Do something useful to the encoding information.
## For now, these are really NULL or indirect references ...
##
class(tab) <- c("pdf_fonts", "data.frame")
tab
}
format.pdf_fonts <-
function(x, ...)
{
y <- format.data.frame(x, ...)
## For now, simply don't show the encoding information.
## Alternatively:
## enc <- character(length(x$enc))
## ind <- vapply(x$enc, is.null, FALSE)
## enc[!ind] <- sapply(x$enc[!ind], format)
## y$enc <- enc
y$enc <- NULL
y
}
print.pdf_fonts <-
function(x, ...)
{
print.data.frame(format(x, ...))
invisible(x)
}
## * pdf_info
pdf_info <-
function(file)
{
doc <- if(inherits(file, "pdf_doc"))
file
else
pdf_doc(file, cache = FALSE)
if(is.null(con <- doc$con)) {
con <- file(doc$file, "rb")
on.exit(close(con))
}
ref <- doc$trailer[["Info"]]
if(is.null(ref)) {
info <- list()
} else {
info <- unclass(pdf_doc_get_object(doc, ref, con))
## Be nice (the PDF Reference does not explicitly say that
## values in the Document Information Dictionary may be indirect
## references):
ind <- as.logical(sapply(info, inherits,
"PDF_Indirect_Reference"))
if(any(ind))
info[ind] <-
lapply(info[ind],
function(ref) pdf_doc_get_object(doc, ref, con))
## Transform text strings.
## See PDF Reference version 1.7 section 10.2.1:
## The value associated with any key not specifically
## mentioned in Table 10.2 must be a text string.
## If they are not PDF strings, drop them.
keys <- c("CreationDate", "ModDate", "Trapped")
pos <- which(is.na(match(names(info), keys)))
ind <- as.logical(sapply(info[pos], inherits, "PDF_String"))
info[pos[ind]] <-
lapply(info[pos[ind]], PDF_Text_String_to_character)
info[pos[!ind]] <- NULL
## Transform trapping information.
## Could provide default value "unknown".
## Transform dates to POSIXt if possible.
if(!is.null(dt <- info[["CreationDate"]]))
info[["CreationDate"]] <- PDF_Date_to_POSIXt(dt)
if(!is.null(dt <- info[["ModDate"]]))
info[["ModDate"]] <- PDF_Date_to_POSIXt(dt)
## Alternatively, use
## keys <- c("CreationDate", "ModDate")
## ind <- !is.na(match(names(info), keys))
## info[ind] <- lapply(info[ind], PDF_Date_to_POSIXt)
}
pages <- pdf_doc_get_page_list(doc, con)
info[["Pages"]] <- length(pages)
rectangles <- lapply(pages, `[[`, "MediaBox")
urx <- unlist(lapply(rectangles, `[[`, 3L))
ury <- unlist(lapply(rectangles, `[[`, 4L))
if((length(uurx <- unique(urx)) == 1L) &&
(length(uury <- unique(ury)) == 1L)) {
info["Page size"] <- sprintf("%s x %s pts", uurx, uury)
pos <- which((abs(pdf_page_sizes[, 1L] - uurx) < 1) &
(abs(pdf_page_sizes[, 2L] - uury) < 1))
if(!length(pos)) {
pos <- which((abs(pdf_page_sizes[, 2L] - uurx) < 1) &
(abs(pdf_page_sizes[, 1L] - uury) < 1))
}
if(length(pos))
info["Page size"] <-
sprintf("%s [%s]",
info["Page size"],
rownames(pdf_page_sizes)[pos])
}
info[["File size"]] <- sprintf("%d bytes", doc$size)
version <- substring(doc$header, 6L)
catalog <- pdf_doc_get_object(doc, doc$trailer[["Root"]], con)
version_in_catalog <- catalog[["Version"]]
if(!is.null(version_in_catalog)) {
## Use version in catalog only if given (as a name) and later
## than the version specified in the header.
if(as.numeric_version(version_in_catalog) > version)
version <- version_in_catalog
}
info[["PDF version"]] <- version
class(info) <- "pdf_info"
info
}
format.pdf_info <-
function(x, ...)
{
formatDL(sprintf("%s:", names(x)),
sapply(x, format),
...)
}
## * Object readers
pdf_read_object <-
function(con, doc = NULL)
{
if(pdftools_debug_level() > 0L) {
bytes <- .con_read_bytes(con, 10L)
message(sprintf("looking at %s", deparse(intToUtf8(bytes))))
.con_seek(con, -length(bytes), 2L)
}
x <- read_next_non_whitespace(con)
if(!length(x)) return(NA)
.con_seek(con, -1L, 2L)
if(x == 0x28) # charToRaw("(") => 28
pdf_read_object_string_literal(con)
else if(x == 0x2f) # charToRaw("/") => 2f
pdf_read_object_name(con)
else if(x == 0x5b) # charToRaw("[") => 5b
pdf_read_object_array(con)
##
## Handled by pdf_read_object_keyword() now.
## else if(x == "n")
## pdf_read_object_null(con)
## else if(x %in% c("t", "f"))
## pdf_read_object_boolean(con)
##
else if(x == 0x3c) { # charToRaw("<") => 3c
## Hexadecimal string or dictionary
bytes <- .con_read_bytes(con, 2L)
.con_seek(con, -2L, 2L)
if(all(bytes == c(0x3c, 0x3c)))
pdf_read_object_dictionary_or_stream(con, doc)
else
pdf_read_object_string_hexadecimal(con)
}
else if(x == 0x25) { # charToRaw("%") => 25
## Read until eol.
repeat {
x <- .con_read_bytes(con, 1L)
if(x %.IN.% pdf_bytes_eols) break
}
read_next_non_whitespace_and_seek_back(con)
pdf_read_object(con)
}
else if(x %.IN.% pdf_bytes_in_numerics_not_digits)
pdf_read_object_numeric(con)
else if(x %.IN.% pdf_bytes_digits) {
## Could be a number object or an indirect object reference.
bytes <- .con_read_bytes(con, 20L)
.con_seek(con, - length(bytes), 2L)
## Cannot simply call rawToChar(bytes) as we might have read nul
## bytes.
if(length(pos <- which(bytes == 0))) {
bytes <- bytes[seq_len(pos[1L] - 1L)]
}
if(grepl("^[[:digit:]]+\\s[[:digit:]]+\\sR[^[:alpha:]]",
rawToChar(bytes),
useBytes = TRUE))
pdf_read_object_indirect_reference(con)
else
pdf_read_object_numeric(con)
}
else {
## A keyword, hopefully.
pdf_read_object_keyword(con)
}
}
##
## Handled by pdf_read_object_keyword() now.
##
## pdf_read_object_boolean <-
## function(con)
## {
## x <- rawToChar(.con_read_bytes(con, 1L))
## if(x == "t") {
## bytes <- .con_read_bytes(con, 3L)
## if(rawToChar(bytes) == "rue")
## return(TRUE)
## } else if(x == "f") {
## bytes <- .con_read_bytes(con, 4L)
## if(rawToChar(bytes) == "alse")
## return(FALSE)
## }
## stop("cannot read boolean object")
## }
##
##
pdf_read_object_numeric <-
function(con)
{
table <- pdf_bytes_in_numerics
bytes <- raw()
while((x <- .con_read_bytes(con, 1L)) %.IN.% table) {
bytes <- c(bytes, x)
}
.con_seek(con, -1L, 2L)
s <- rawToChar(bytes)
if(grepl(".", s, fixed = TRUE))
as.numeric(s)
else
as.integer(s)
}
## PDF string objects.
## See PDF Reference version 1.7 section 3.2.3:
## A PDF string simply is a sequence of bytes.
## As this may contain nuls, we cannot unconditionally represent these
## as R character strings (which must not have embedded nulls, even with
## a "bytes" encoding), and conditionalizing the representation seems
## rather awkward.
## Hence, we represent PDF strings as a byte (raw) vector of class
## "PDF_String".
## See PDF Reference version 1.7 section 3.8.1:
## PDF has the notion of a "text string" type
## used for human-readable characters, such as text annotations,
## bookmark names, article names, and document information. These
## strings are encoded using either PDFDocEncoding or UTF-16BE with a
## leading byte-order marker.
## with note:
## This is not a true data type, but a string type that represents
## data encoded using specific conventions.
## As the context indicates when PDF strings are to be taken as PDF text
## strings (e.g. for the title or author specification in the Document
## Information Dictionary, see PDF Reference version 1.7 section 10.2.1),
## we do not use a subclass of "PDF_String" for text strings: instead,
## we use
## PDF_Text_String_to_character()
## to convert the byte vector to an R character string encoded in
## UTF-8 when the text string context applies.
## (Maybe PDF_Text_String_to_UTF8() would be a better name?)
pdf_read_object_string_literal <-
function(con)
{
x <- .con_read_bytes(con, 1L)
lparen <- charToRaw("(")
if(x != lparen)
stop("cannot read literal string object")
rparen <- charToRaw(")")
escape <- charToRaw("\\")
pdf_bytes_escape_tails <- charToRaw("nrtbf()\\")
pdf_bytes_escape_bytes <- charToRaw("\n\r\t\b\f()\\")
names(pdf_bytes_escape_bytes) <-
as.character(pdf_bytes_escape_tails)
bytes <- raw()
parens <- 1L
repeat {
x <- .con_read_bytes(con, 1L)
if(!length(x)) break
if(x == lparen) {
parens <- parens + 1L
} else if(x == rparen) {
parens <- parens - 1L
if(!parens) break
} else if(x == escape) {
x <- .con_read_bytes(con, 1L)
if(x %.IN.% pdf_bytes_digits) {
i <- 0L
while(i < 2L) {
y <- .con_read_bytes(con, 1L)
if(!(y %.IN.% pdf_bytes_digits)) {
.con_seek(con, -1L, 2L)
break
}
x <- c(x, y)
i <- i + 1L
}
x <- as.raw(strtoi(rawToChar(x), 8L))
} else if(x %.IN.% pdf_bytes_escape_tails) {
x <- pdf_bytes_escape_bytes[as.character(x)]
} else if(x %.IN.% pdf_bytes_eols) {
x <- .con_read_bytes(con, 1L)
if(!(x %.IN.% pdf_bytes_eols))
.con_seek(con, -1L, 2L)
x <- raw()
}
## See PDF Reference version 1.7 section 3.2.3.
## If the character following the backslash is not not a
## special character for an escape sequence, the backslash
## is ignored.
}
bytes <- c(bytes, x)
}
class(bytes) <- "PDF_String"
bytes
}
pdf_read_object_string_hexadecimal <-
function(con)
{
x <- .con_read_bytes(con, 1L)
if(x != 0x3c) # charToRaw("<") => 3c
stop("cannot read hexadecimal string object")
end <- charToRaw(">")
## See PDF Reference version 1.7 section 3.2.3:
## Each pair of hexadecimal digits defines one byte of the string.
## White-space characters are ignored.
## If the final digit of a hexadecimal string is missing (i.e., if
## there is an odd number of digits) it is assumed to be 0.
bytes <- raw()
repeat {
x <- .con_read_bytes(con, 1L)
if(x == end) break
if(!(x %.IN.% pdf_bytes_whitespaces))
bytes <- c(bytes, x)
}
if(length(bytes) %% 2)
bytes <- c(bytes, charToRaw("0"))
n <- length(bytes) %/% 2
s <- substring(rawToChar(bytes),
seq(1L, by = 2L, length.out = n),
seq(2L, by = 2L, length.out = n))
bytes <- as.raw(strtoi(s, 16L))
class(bytes) <- "PDF_String"
bytes
}
format.PDF_String <-
function(x, ...)
{
sprintf("PDF_String(<%s>)", paste(as.character(x), collapse = ""))
}
## PDF name objects.
pdf_read_object_name <-
function(con)
{
## See PDF Reference version 1.7 section 3.2.4:
## A slash character (/) introduces a name. The slash is not part
## of the name but is a prefix indicating that the following
## sequence of characters constitutes a name.
##
## The name may include any regular characters, but not delimiter
## or white-space characters.
##
## Beginning with PDF 1.2, any character except null (character
## code 0) may be included in a name by writing its 2-digit
## hexadecimal code, preceded by the number sign character (#).
x <- .con_read_bytes(con, 1L)
if(x != 0x2f) # charToRaw("/") => 2f
stop("cannot read name object")
bytes <- raw()
repeat {
x <- .con_read_bytes(con, 1L)
if(!length(x) || (x %.IN.% pdf_bytes_non_regulars)) {
.con_seek(con, -1L, 2L)
break
}
bytes <- c(bytes, x)
}
## Cf. also URLdecode().
pos <- which(bytes == 0x23) # charToRaw("#") => 23
if(length(pos)) {
hex <- sapply(pos,
function(p) rawToChar(bytes[p + (1L : 2L)]))
bytes[pos] <- as.raw(strtoi(hex, 16L))
bytes <- bytes[- c(pos + 1L, pos + 2L)]
}
## Note that we currently leave the leading slash as part of the
## name.
s <- rawToChar(bytes)
class(s) <- "PDF_Name"
s
}
print.PDF_Name <-
function(x, ...)
{
print(noquote(unclass(x)), ...)
invisible(x)
}
pdf_read_object_array <-
function(con)
{
x <- .con_read_bytes(con, 1L)
if(x != 0x5b) # charToRaw("[") => 5b
stop("cannot read array object")
end <- charToRaw("]")
y <- list()
repeat {
x <- read_next_non_whitespace_and_seek_back(con)
if(x == end) {
.con_read_bytes(con, 1L)
break
}
y <- c(y, list(pdf_read_object(con)))
}
class(y) <- "PDF_Array"
y
}
format.PDF_Array <-
function(x, ...)
{
sprintf("PDF_Array(%d)", length(x))
}
pdf_read_object_dictionary_or_stream <-
function(con, doc = NULL)
{
bytes <- .con_read_bytes(con, 2L)
if(!all(bytes == c(0x3c, 0x3c)))
stop("cannot read dictionary object")
end <- charToRaw(">")
y <- list()
repeat {
x <- read_next_non_whitespace_and_seek_back(con)
if(x == end) {
.con_read_bytes(con, 2L)
break
}
key <- pdf_read_object(con)
read_next_non_whitespace_and_seek_back(con)
val <- pdf_read_object(con)
y[key] <- list(val)
}
pos <- .con_seek(con)
## Check whether this is in fact a stream object.
## Read ahead.
read_next_non_whitespace_and_seek_back(con)
if(rawToChar(.con_read_bytes(con, 6L)) == "stream") {
## Argh. Handle the EOL marker assuming compliance: should
## check for this.
eol <- .con_read_bytes(con, 1L)
if(!(eol %.IN.% pdf_bytes_eols))
stop("cannot read stream object")
if(eol == charToRaw("\r"))
.con_read_bytes(con, 1L)
## Need length information in dictionary.
len <- y[["Length"]]
if(is.null(len))
stop("cannot read stream object")
## However, the length could be an indirect object reference.
## In this case we can only resolve the length for a non-NULL
## doc with the cross-reference needed. Otherwise, we record
## the stream data start, and defer reading the bytes.
if((ref <- inherits(len, "PDF_Indirect_Reference")) &&
is.null(doc)) {
y[["__stream_start__"]] <- .con_seek(con)
y[["__stream_bytes__"]] <- NULL
} else {
if(ref) {
tell <- .con_seek(con)
len <- pdf_doc_get_object(doc, len, con)
.con_seek(con, tell)
}
y[["__stream_bytes__"]] <- .con_read_bytes(con, len)
## Now check if we really hit the end of the stream.
read_next_non_whitespace_and_seek_back(con)
bytes <- .con_read_bytes(con, 9L)
if(rawToChar(bytes) != "endstream")
stop("cannot read stream object")
}
class(y) <- "PDF_Stream"
} else {
.con_seek(con, pos)
class(y) <- "PDF_Dictionary"
}
y
}
format.PDF_Dictionary <-
function(x, ...)
{
sprintf("PDF_Dictionary(<<%s>>)",
paste(names(x), collapse = ","))
}
format.PDF_Stream <-
function(x, ...)
{
sprintf("PDF_Stream(<<%s>>)",
paste(names(x), collapse = ","))
}
## Experimental summary methods.
## Cannot easily make this the print method, because PDF dictionary
## and stream objects can be recursive ...
summary.PDF_Dictionary <-
function(object, ...)
writeLines(sprintf("%s: %s", names(object), sapply(object, format)))
summary.PDF_Stream <-
function(object, ...)
{
if(!is.null(bytes <- object[["__stream_bytes__"]]))
object[["__stream_bytes__"]] <-
sprintf("%d bytes", length(bytes))
writeLines(sprintf("%s: %s", names(object), sapply(object, format)))
}
##
## Handled by pdf_read_object_keyword() now.
##
## pdf_read_object_null <-
## function(con)
## {
## bytes <- .con_read_bytes(con, 4L)
## if(rawToChar(bytes) != "null")
## stop("cannot read null object")
## NULL
## }
##
##
pdf_read_object_indirect_reference <-
function(con)
{
num <- read_next_bytes_until_whitespace(con)
read_next_non_whitespace_and_seek_back(con)
gen <- read_next_bytes_until_whitespace(con)
read_next_non_whitespace_and_seek_back(con)
x <- .con_read_bytes(con, 1L)
if(x != 0x52) # charToRaw("R") => 52
stop("cannot read indirect reference object")
y <- c(num = as.integer(rawToChar(num)),
gen = as.integer(rawToChar(gen)))
class(y) <- "PDF_Indirect_Reference"
y
}
format.PDF_Indirect_Reference <-
function(x, ...)
{
sprintf("PDF_Indirect_Reference(%d,%d)", x["num"], x["gen"])
}
pdf_dereference_maybe <-
function(obj, doc, con = NULL)
{
if(inherits(obj, "PDF_Indirect_Reference"))
obj <- pdf_doc_get_object(doc, obj, con)
obj
}
pdf_read_object_keyword <-
function(con)
{
bytes <- raw()
repeat {
x <- .con_read_bytes(con, 1L)
if(!length(x) || !(x %.IN.% pdf_bytes_in_keywords)) {
.con_seek(con, -1L, 2L)
break
}
bytes <- c(bytes, x)
}
s <- rawToChar(bytes)
if(s == "null")
NULL
else if(s == "true")
TRUE
else if(s == "false")
FALSE
else {
class(s) <- "PDF_Keyword"
s
}
}
print.PDF_Keyword <-
function(x, ...)
{
print(noquote(unclass(x)), ...)
invisible(x)
}
pdf_read_object_header <-
function(con)
{
## Read num and gen.
read_next_non_whitespace_and_seek_back(con)
num <- read_next_bytes_until_whitespace(con)
read_next_non_whitespace_and_seek_back(con)
gen <- read_next_bytes_until_whitespace(con)
## Now skip the "obj".
.con_read_bytes(con, 3L)
read_next_non_whitespace_and_seek_back(con)
c(num = suppressWarnings(as.integer(rawToChar(num))),
gen = suppressWarnings(as.integer(rawToChar(gen))))
}
pdf_read_indirect_object_at_pos <-
function(con, pos, num = NA_integer_, gen = NA_integer_, doc = NULL)
{
## Move to pos.
.con_seek(con, pos)
## Read header first.
hdr <- pdf_read_object_header(con)
## Be paranoid.
if(anyNA(hdr))
stop(gettextf("cannot find object header at xrefed position %d",
pos),
domain = NA)
## Apparently it is feasible to have cross-references to indirect
## objects with actually different object and/or generation numbers:
## as of 2011-09-27, grImport/inst/doc/Rnewspage27.pdf had both
## objects 69 and 70 point to the same offset [providing object 70].
## For now, give a message and proceed.
if(!is.na(num) && (num != hdr["num"]))
message(gettextf("mismatch in object numbers (given: %d, found: %d)",
num, hdr["num"]),
domain = NA)
if(!is.na(gen) && (gen != hdr["gen"]))
message(gettextf("mismatch in generation numbers (given: %d, found: %d)",
gen, hdr["gen"]),
domain = NA)
## Read object.
pdf_read_object(con, doc)
}
pdf_read_stream_bytes <-
function(con, obj, doc = NULL)
{
len <- obj[["Length"]]
if(inherits(len, "PDF_Indirect_Reference")) {
len <- pdf_doc_get_object(doc, len, con)
}
pos <- obj[["__stream_start__"]]
.con_seek(con, pos)
bytes <- .con_read_bytes(con, len)
## Check if we really hit the end of the stream.
read_next_non_whitespace_and_seek_back(con)
if(rawToChar(.con_read_bytes(con, 9L)) != "endstream")
stop("cannot read stream data")
bytes
}
pdf_doc_get_object <-
function(doc, ref, con = NULL)
{
if(!inherits(doc, "pdf_doc")) stop("wrong class")
if(is.character(ref)) {
ref <- as.integer(unlist(strsplit(ref, ".", fixed = TRUE)))
}
if(length(ref) == 1L) {
names(ref) <- "num"
} else if(length(ref) == 2L) {
names(ref) <- c("num", "gen")
}
num <- ref["num"]
gen <- ref["gen"]
## First look in the object cache.
if(doc$cache) {
pos <- match(.ref_to_name(ref), names(doc$objects))
if(!is.na(pos)) return(doc$objects[[pos]])
}
if(is.null(con) && is.null(con <- doc$con)) {
con <- file(doc$file, "rb")
on.exit(close(con))
}
## Next look in the xrefs for object streams.
if((is.na(gen) || (gen == 0L)) &&
(length(pos <- which(doc$xref_objs[, "num"] == num)))) {
if(length(pos) > 1L) {
## Can this really happen?
pos <- pos[1L]
}
ptr <- doc$xref_objs[pos, ]
idx <- ptr["idx"]
obj <- pdf_doc_get_object(doc, ptr["str"], con)
## Could check whether
## obj[["Type"]] == "ObjStm"
n <- obj[["N"]]
if(idx >= n)
stop("invalid index in object stream lookup")
first <- obj[["First"]]
stream <- rawConnection(PDF_Stream_get_data(obj, doc))
on.exit(close(stream), add = TRUE)
i <- 0L
if(doc$cache) {
while(i < n) {
cnum <- pdf_read_object(stream)
read_next_non_whitespace_and_seek_back(stream)
cpos <- pdf_read_object(stream)
read_next_non_whitespace_and_seek_back(stream)
tell <- .con_seek(stream, first + cpos)
obj <- pdf_read_object(stream, doc)
key <- .ref_to_name(cnum)
doc$objects[key] <- list(obj)
.con_seek(stream, tell)
i <- i + 1L
}
return(doc$objects[[.ref_to_name(num)]])
} else {
while(i <= idx) {
cnum <- pdf_read_object(stream)
read_next_non_whitespace_and_seek_back(stream)
cpos <- pdf_read_object(stream)
read_next_non_whitespace_and_seek_back(stream)
i <- i + 1L
}
.con_seek(stream, obj[["First"]] + cpos)
return(pdf_read_object(stream, doc))
}
}
## Figure out the position to start from.
if(length(ref) == 1L) {
pos <- which.max(doc$xref_tabs[, "num"] == ref)
gen <- doc$xref_tabs[pos, "gen"]
pos <- doc$xref_tabs[pos, "pos"]
}
else {
ind <- ((doc$xref_tabs[, "num"] == num) &
(doc$xref_tabs[, "gen"] == gen))
pos <- doc$xref_tabs[ind, "pos"]
}
obj <- pdf_read_indirect_object_at_pos(con, pos, num, gen, doc)
if(doc$cache)
doc$objects[.ref_to_name(c(num, gen))] <- list(obj)
obj
}
pdf_doc_get_objects <-
function(doc, ids = NULL, con = NULL)
{
if(!inherits(doc, "pdf_doc")) stop("wrong class")
## Start with the object cache.
objects <- doc$objects
## If this contains as many objects as there are names, all objects
## have already been cached.
if(length(objects) == doc$length) {
if(!length(ids))
return(objects)
else
return(objects[ids])
}
## Otherwise, we need to get the objects not yet in the cache (which
## could be all objects if caching is off, of course).
if(is.null(con) && is.null(con <- doc$con)) {
con <- file(doc$file, "rb")
on.exit(close(con))
}
debug <- (pdftools_debug_level() > 0L)
## First get the objects from the old-style xref tables.
tab <- doc$xref_tabs
str <- .ref_to_name(doc$xref_objs[, "str"], "0")
## Determine the active objects needed.
tab <- tab[((tab[, "pos"] > 0L) & (tab[, "use"] > 0L)), ,
drop = FALSE]
## If ids is NULL (so that we are getting all active objects), we
## need those active objects not yet in the cache.
ind <- is.na(match(.ref_to_name(tab[, c("num", "gen"),
drop = FALSE]),
names(objects)))
## Otherwise, we only need those active objects not yet in the cache
## which are directly matched by ids, or which contain streams with
## objects matched by ids.
if(length(ids)) {
nms <- .ref_to_name(tab[ind, c("num", "gen"), drop = FALSE])
wanted_by_direct_match <- !is.na(match(nms, ids))
if(!length(str)) {
ind <- ind & wanted_by_direct_match
} else {
pos <- match(ids,
.ref_to_name(doc$xref_objs[, "num"], "0"),
nomatch = 0L)
str <- str[pos]
wanted_by_stream_match <- !is.na(match(nms, unique(str)))
ind <- ind & (wanted_by_direct_match | wanted_by_stream_match)
}
}
for(i in which(ind)) {
entry <- tab[i, ]
if(debug)
message(sprintf("processing %s",
paste(names(entry), entry, collapse = " ")))
pos <- entry["pos"]
num <- entry["num"]
gen <- entry["gen"]
obj <- pdf_read_indirect_object_at_pos(con, pos, num, gen, doc)
key <- .ref_to_name(c(num, gen))
if(doc$cache)
doc$objects[key] <- list(obj)
objects[key] <- list(obj)
}
## Now for the new-style xref streams objects.
for(s in unique(str)) {
obj <- objects[[s]]
n <- obj[["N"]]
first <- obj[["First"]]
stream <- rawConnection(PDF_Stream_get_data(obj, doc))
tab <- matrix(0, n, 2L)
## First read the object numbers and byte offsets.
i <- 1L
while(i <= n) {
tab[i, 1L] <- pdf_read_object(stream)
read_next_non_whitespace_and_seek_back(stream)
tab[i, 2L] <- pdf_read_object(stream)
read_next_non_whitespace_and_seek_back(stream)
i <- i + 1L
}
## Determine the objects still needed.
pos <- which(is.na(match(.ref_to_name(tab[, 1L], 0),
names(objects))))
if(length(ids)) {
pos <- pos[!is.na(match(.ref_to_name(tab[pos, 1L], 0), ids))]
}
## Then read the objects from the stream.
for(i in pos) {
.con_seek(stream, first + tab[i, 2L])
obj <- pdf_read_object(stream, doc)
key <- .ref_to_name(tab[i, 1L])
if(doc$cache)
doc$objects[key] <- list(obj)
objects[key] <- list(obj)
}
close(stream)
}
if(length(ids))
objects <- objects[ids]
objects
}
## * pdf_doc_get_page_tree
pdf_doc_get_page_tree <-
function(doc, con = NULL)
{
if(!inherits(doc, "pdf_doc")) stop("wrong class")
if(is.null(con) && is.null(con <- doc$con)) {
con <- file(doc$file, "rb")
on.exit(close(con))
}
debug <- (pdftools_debug_level() > 0L)
catalog <- pdf_doc_get_object(doc, doc$trailer[["Root"]], con)
## Pages entry in the catalog dictionary is required and must be an
## indirect reference.
pages <- pdf_doc_get_object(doc, catalog[["Pages"]], con)
recurse <- function(x) {
if(!is.null(kids <- x[["Kids"]])) {
x[["Kids"]] <-
lapply(kids,
function(kid)
if(inherits(kid, "PDF_Indirect_Reference")) {
if(debug) {
message(sprintf("expanding %s",
format(kid)))
}
recurse(pdf_doc_get_object(doc, kid, con))
} else {
kid
})
class(x) <- "PDF_Page_Tree"
} else {
## No more kids, should be a leaf node.
## Could check on Type being Page.
class(x) <- "PDF_Page"
}
x
}
recurse(pages)
}
pdf_doc_get_page_list <-
function(doc, con = NULL)
{
if(!inherits(doc, "pdf_doc")) stop("wrong class")
if(is.null(con) && is.null(con <- doc$con)) {
con <- file(doc$file, "rb")
on.exit(close(con))
}
## See PDF Reference version 1.7 section 3.6.2.
## Page objects can inherit
## Resources MediaBox CropBox Rotate
## from its ancestors.
## Hence, we flattening out the tree to the list of pages (leaf
## nodes of the page tree), propagate these entries.
inherited_entry_names <-
c("Resources", "MediaBox", "CropBox", "Rotate")
pages <- list()
## Cannot use rapply() because this only deals with nodes which are
## not lists.
recurse <- function(x) {
if(!is.null(kids <- x[["Kids"]])) {
entries <- x[inherited_entry_names]
for(kid in kids) {
cls <- class(kid)
kid <- c(kid, entries[is.na(match(names(entries),
names(kids)))])
class(kid) <- cls
if(inherits(kid, "PDF_Page")) {
pages <<- c(pages, list(kid))
}
recurse(kid)
}
}
}
recurse(pdf_doc_get_page_tree(doc, con))
pages
}
## * pdf_doc_get_content_streams
##
## Most likely this should only extract the Contents entries and let the
## "consumers" do the expansions of indirect object references as needed.
##
pdf_doc_get_page_content_streams <-
function(doc, con = NULL)
{
if(!inherits(doc, "pdf_doc")) stop("wrong class")
if(is.null(con) && is.null(con <- doc$con)) {
con <- file(doc$file, "rb")
on.exit(close(con))
}
pages <- pdf_doc_get_page_list(doc, con)
## See PDF Reference version 1.7 section 3.6.2.
## A page object may have a Contents entry with value a single
## content stream or an array of such streams.
## A missing Contents entry means that the page is empty.
lapply(pages,
function(p) {
obj <- p[["Contents"]]
if(inherits(obj, "PDF_Array"))
lapply(obj, pdf_dereference_maybe, doc, con)
else
pdf_dereference_maybe(obj, doc, con)
})
}
##
## Most likely this should only extract the Resources entries and let the
## "consumers" do the expansions of indirect object references as needed
## (as well as handle inheritance from ancestors).
##
pdf_doc_get_page_resources <-
function(doc, con = NULL)
{
if(!inherits(doc, "pdf_doc")) stop("wrong class")
if(is.null(con) && is.null(con <- doc$con)) {
con <- file(doc$file, "rb")
on.exit(close(con))
}
pages <- pdf_doc_get_page_list(doc, con)
## See PDF Reference version 1.7 section 3.6.2.
## A page object may have a Resources entry giving a dictionary
## (which apparently could be an indirect object reference).
## An empty dictionary means no resources.
## A missing Resources entry means that resources are inherited from
## an ancestor node in the page tree.
lapply(pages,
function(p) {
pdf_dereference_maybe(p[["Resources"]], doc, con)
})
}
## * Streams
PDF_Stream_get_data <-
function(obj, doc = NULL)
{
bytes <- obj[["__stream_bytes__"]]
if(is.null(bytes)) {
if(is.null(doc))
stop("cannot read stream data")
con <- doc$con
if(is.null(con)) {
con <- file(doc$file, "rb")
on.exit(close(con))
}
bytes <- pdf_read_stream_bytes(con, obj, doc)
}
filters <- as.list(obj[["Filter"]])
## Handle DecodeParms.
## The PDF specs say that if there is a single filter, DecodeParms
## can be a dictionary object with the parameters, but need not be
## given if the defaults are to be used. If there are multiple
## filters and any filter has non-default parameters, DecodeParms
## must be an array with one entry for each filter: either a
## dictionary with the parameters or the null object.
parameters <- obj[["DecodeParms"]]
if(is.null(parameters))
parameters <- rep.int(list(NULL), length(filters))
else if(inherits(parameters, "PDF_Dictionary"))
parameters <- list(parameters)
for(i in seq_along(filters)) {
filter <- filters[[i]]
if(filter == "FlateDecode")
bytes <- pdf_filter_flate_decode(bytes, parameters[[i]])
else
stop(gettextf("unsupported filter %s",
sQuote(filter)),
domain = NA)
}
bytes
}
## * Filters
pdf_filter_flate_decode <-
function(x, params)
{
## Need to decompress first
m <- memDecompress(x, "gzip")
predictor <- params[["Predictor"]]
if(is.null(predictor) || (predictor == 1L))
return(m)
if((predictor < 10L) && (predictor > 15L)) {
stop(gettextf("unsupported %s predictor %d",
"flatedecode",
predictor),
domain = NA)
}
columns <- params[["Columns"]]
bytes <- raw()
rowlength <- columns + 1L
prev_rowdata <- integer(rowlength)
for(row in seq(0L, length.out = length(m) / rowlength)) {
##
## Use a rawConnection() instead.
rowdata <- as.integer(m[seq(row * rowlength + 1L,
(row + 1L) * rowlength)])
##
fb <- rowdata[1L]
if(fb == 1L) {
for(i in seq(3L, rowlength))
rowdata[i] <- (rowdata[i] + rowdata[i - 1L]) %% 256
} else if(fb == 2L) {
for(i in seq(2L, rowlength))
rowdata[i] <- (rowdata[i] + prev_rowdata[i]) %% 256
} else if(fb != 0L) {
stop(gettextf("unsupported PNG filter %d", fb),
domain = NA)
}
prev_rowdata <- rowdata
bytes <- c(bytes, as.raw(rowdata[-1L]))
}
bytes
}
## * Content streams
pdf_content_stream_read_objects <-
function(con, doc = NULL)
{
objects <- list()
while(!identical(obj <- pdf_read_object(con, doc), NA))
objects <- c(objects, list(obj))
objects
}
## * Common data structures
## ** PDF text strings
PDF_Text_String_to_character <-
function(bytes)
{
## PDF text strings can be encoded in PDFDocEncoding ot UTF-16BE.
if(identical(bytes[c(1L, 2L)], as.raw(c(0xfe, 0xff)))) {
## If the first two bytes represent the Unicode byte-order marker
## U+FEFF, this is a text string encoded in UTF-16BE.
bytes <- bytes[-c(1L, 2L)]
##
## This can contain the language encoding as
## U+001B (i.e., as.raw(c(0x00, 0x1b)))
## 2-byte ISO-639 language code
## 2-byte ISO-639 language code [optional]
## U+001B
##
language <- NULL
pos <- which(bytes == 0x00)
if(length(pos)) {
pos <- pos[bytes[pos + 1L] == 0x1b]
if(length(pos) == 2L) {
ini <- pos[1L]
len <- pos[2L] - ini
if(len == 4L) {
language <- rawToChar(bytes[c(ini + 2L, ini + 3L)])
bytes <- bytes[- (ini + (0L : 5L))]
} else if(len == 6L) {
## Use IETF language tag format.
language <-
paste(rawToChar(bytes[c(ini + 2L, ini + 3L)]),
rawToChar(bytes[c(ini + 4L, ini + 5L)]),
sep = "-")
bytes <- bytes[- (ini + (0L : 7L))]
}
}
}
s <- intToUtf8(bytes)
if(!is.null(language))
attr(s, "Language") <- language
s
} else {
intToUtf8(PDFDocEncoding[as.character(bytes)])
}
}
##
## Integrate into tools::charset_to_Unicode eventually.
PDFDocEncoding <-
c(0x0000, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0x0009, 0x000a, 0xfffd, 0x000c, 0x000d, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0xfffd,
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0xfffd,
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0xfffd, 0x00ae, 0x00af,
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff)
names(PDFDocEncoding) <- format.hexmode(0 : 255)
##
## ** PDF dates
## See PDF Reference version 1.7 section 3.8.3.
## Dates are of the form
## (D:YYYYMMDDHHmmSSOHH'mm')
## where
## * YYYY MM DD HH mm SS have the usual meanings
## * O is the relationship of local time to Universal Time (UT),
## denoted by one of the characters +, -, or Z: a plus sign (+) as
## the value of the O field signifies that local time is later
## than UT, a minus sign (-) signifies that local time is earlier
## than UT, and the letter Z signifies that local time is UT.
## * the apostrophe character after HH and mm is part of the syntax
## * all fields after the year are optional
## * the prefix 'D:', although also optional, is strongly recommended
## * the default values for MM and DD are both 01
## * all other numerical fields default to zero values.
PDF_Date_to_POSIXt <-
function(bytes)
{
## Strip optional 'D:' prefix.
s <- sub("^D:", "", rawToChar(bytes))
## Strip apostrophes in offset spec.
s <- gsub("'", "", s)
if(nchar(s) <= 14L) {
s <- sprintf("%s%s", s,
substring(" 0101000000", nchar(s) + 1L, 14L))
strptime(s, "%Y%m%d%H%M%S")
} else if(substring(s, 15L, 15L) == "Z") {
strptime(substring(s, 1L, 14L), "%Y%m%d%H%M%S")
} else {
strptime(s, "%Y%m%d%H%M%S%z")
}
}
## * Utilities
.match_ids_to_pdf_doc_names <-
function(ids, doc)
{
ids <- as.character(ids)
## Exact matches first.
pos <- match(ids, doc$names)
if(any(ind <- is.na(pos))) {
## Try expanding a num-only id to the one with highest gen.
num <- ids[ind]
yep <- match(num, names(doc$gens_by_nums), nomatch = 0L)
gen <- vapply(doc$gens_by_nums[yep], `[[`, 0L, 1L)
yep <- (yep > 0L)
pos[ind][yep] <-
match(.ref_to_name(num[yep], gen), doc$names)
}
pos
}
.ref_to_name <-
function(x, y = NULL)
{
sep <- "."
if(!length(x))
character()
else if(!is.null(y)) {
y <- rep(y, length.out = length(x))
paste(x, y, sep = sep)
}
else if(is.matrix(x))
paste(x[, 1L], x[, 2L], sep = sep)
else if(length(x) > 1L)
paste(x[1L], x[2L], sep = sep)
else
paste(x, "0", sep = sep)
}
read_next_bytes_until_whitespace <-
function(con)
{
bytes <- raw()
repeat {
x <- .con_read_bytes(con, 1L)
if(!length(x) || (x %.IN.% pdf_bytes_whitespaces)) break
bytes <- c(bytes, x)
}
bytes
}
read_next_non_whitespace <-
function(con)
{
repeat {
x <- .con_read_bytes(con, 1L)
if(!length(x) || !(x %.IN.% pdf_bytes_whitespaces)) break
}
x
}
read_next_non_whitespace_and_seek_back <-
function(con)
{
x <- read_next_non_whitespace(con)
.con_seek(con, -1L, 2L)
x
}
read_prev_bytes_after_bytes <-
function(con, set)
{
## Read the previous bytes from con until the first byte in set, and
## move point to the first preceding byte not in set.
bytes <- raw()
repeat {
x <- .con_read_bytes(con, 1L)
.con_seek(con, -2L, 2L)
if(x %.IN.% set) {
while(x %.IN.% set) {
x <- .con_read_bytes(con, 1L)
.con_seek(con, -2L, 2L)
}
.con_seek(con, 1, 2L)
break
} else {
bytes <- c(x, bytes)
}
}
bytes
}
read_prev_bytes_after_eols <-
function(con)
read_prev_bytes_after_bytes(con, pdf_bytes_eols)
raw_connection_to_bytes_in_file <-
function(file)
{
bytes <- readBin(file, "raw", file.info(file)$size)
rawConnection(bytes)
}
## * Utilities to enhance performance
## Test whether a single byte is contained in a sequence of bytes.
`%.IN.%` <-
function(x, bytes)
any(x == bytes)
## Seems that this is faster than using grepRaw(fixed = TRUE) or
## match().
## Calling seek() is inefficient.
## * 1st, it dispatched to seek.connection().
## * 2nd, this pmatches two of its arguments before calling a .Internal.
## Hence, use the following, where values 1L, 2L, 3L for the origin
## correspond to "start", "current" and "end".
.con_seek <-
function(con, where = NA, origin = 1L)
{
.Internal(seek(con, as.double(where), as.integer(origin), 0L))
}
## Calling readBin() to read bytes is somewhat inefficient.
## Hence, use the following.
.con_read_bytes <-
function(con, n = 1L)
{
.Internal(readBin(con, "raw", n, NA_integer_, FALSE, FALSE))
}
## * Utilities for debugging
pdftools_debug_level <-
local({
level <- 0L
function(new) {
if(!missing(new))
level <<- new
else
level
}
})