From 12cb67df6987ea06857c86685543d5f376d4470c Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Sun, 31 May 2026 16:29:32 -0400 Subject: [PATCH 01/10] Prepare for version 0.1.0 release; update WORDLIST --- inst/WORDLIST | 227 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 191 insertions(+), 36 deletions(-) diff --git a/inst/WORDLIST b/inst/WORDLIST index 6cc34f8..9fd3e71 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,91 +1,246 @@ -AcroForm +ABGR ADR -affine +AES +AcroForm ArtBox -bblanchon +BCP +BDC +BGR +BGRA +BGRx +BaseFont Bezier -bitmap's -bitmask BleedBox +CMD +CMap +CPDF +CTM CalGray CalRGB -CMap -CMD Codecov -codepoint -codepoints -colour -colours -conformant +Combobox CropBox -CTM DER +Dest DeviceCMYK DeviceGray DeviceN DeviceRGB DocMDP -drawable -finalisers -finalizer +EF +FitBH +FitBV +FitH +FitR +FitV Flate +FreeText GC +GID +Gerrit +GoTo ICCBased +IETF +ImageMagick JBIG +JRE +JS Lifecycle -listbox +LineTo +MCID +MacRoman MediaBox MediaBox's +MoveTo MultiLine NA's -nativeRaster -neighbour -NoExport -npc NUL +NoExport +NonFullScreenPageMode +NumCopies +Nx ORCID -PageMode +OTF +OpenAction PDF's +PDFDocEncoding PDFium PDFium's -pdftools PKCS -pkgdown PKI +POSIXct +PageLayout +PageMode +PickTrayByPDFSize Poppler +Pre +PrintArea +PrintClip +PrintPageRange +PrintScaling +RGBA +RPATH +Rasterises +Rcpp +ReadOnly +Serialises +TTF +Tabula's +Tibble +ToUnicode +TrimBox +TrueType +UA +UI +URL's +UserAccess +ViewArea +ViewClip +ViewerPreferences +Visualising +WinAnsi +XFA +XMP +XObject +XObject's +XObjects +XYZ +ZapfDingbats +affine +allowlist +analysing +annot +annot's +appender +appenders +bblanchon +behaviour +behavioural +bezierto +bitmap's +bitmask +callouts +charcodes +charset +charsets +checkable +checksums +cmap +codepoint +codepoints +colour +colours +combobox +conformant +cropbox +de +dereferencing +dest +dest's +drawable +embedder +embedder's +enum +externalptr +externalptr's +extractable +facto +filesystem +finalisers +finalizer 
+fitb +fitbh +fitbv +fith +fitr +fitv +flavour +focusable +getter +getters +glyph's +goto +honours +i'th +indexable +ints +lang +linearise +lineto +listbox +lossy +magick +marshalling +materialisation +mediabox +moveto +mutator +mutators +nativeRaster +neighbour +normalise +npc +op'ing +pdftk +pdftools +pkgdown +polyline positionally -POSIXct pre +pushbutton pypdfium qpdf +quadpoints +radiobutton rasterImage rasterise rasterised rasterises -Rasterises -Rcpp -ReadOnly +rasterize +readwrite recognises +rect +reflow renderer -RGBA +representable +rg rollup -RPATH +rpdfium +serialise +serialised +specialisation +squigglies +staplr +subdictionary subpath +subprocess subtype +subtypes synthesised +sys +tabulapdf teardown +tempfile +textfield textfields th tibble tibble's tibbles -TrimBox +toc +triaging +uint un unencrypted +unicode unparseable +unresolvable unstroked -UserAccess -XFA -XObject -XObject's -XObjects +untagged +uri +walkthrough +widget's +xfa +xmpdf +xobject +xyz From b97c08d871b8a48a6742ad336be7865723692c6f Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Sun, 31 May 2026 16:32:40 -0400 Subject: [PATCH 02/10] Simplify cran-comments.md --- cran-comments.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cran-comments.md b/cran-comments.md index 30d448e..84b0741 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -96,10 +96,7 @@ not yet installed. ## Reverse dependencies This is a first submission; there are no reverse dependencies -yet. The internal consumer (`kmextract`, currently using -`pypdfium2` via reticulate) will switch to `pdfium` as a backend -after this release; its conformance suite has been run against -the v0.1.0 candidate. +yet. 
## Examples runtime From 8a5808e49993123575ba4ab898060414c1bf3323 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Sun, 31 May 2026 16:42:44 -0400 Subject: [PATCH 03/10] As submitted to CRAN --- .Rbuildignore | 1 + CRAN-SUBMISSION | 3 +++ 2 files changed, 4 insertions(+) create mode 100644 CRAN-SUBMISSION diff --git a/.Rbuildignore b/.Rbuildignore index 43ba791..fb2ca94 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -26,3 +26,4 @@ ^pkgdown$ ^doc$ ^Meta$ +^CRAN-SUBMISSION$ diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION new file mode 100644 index 0000000..c95df0a --- /dev/null +++ b/CRAN-SUBMISSION @@ -0,0 +1,3 @@ +Version: 0.1.0 +Date: 2026-05-31 20:36:14 UTC +SHA: b97c08d871b8a48a6742ad336be7865723692c6f From e121bc96335cd429524d5f53d1bf584856a9ef1b Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Sun, 31 May 2026 17:38:24 -0400 Subject: [PATCH 04/10] Address CRAN notes and warnings --- R/api_completion.R | 2 +- R/form_fields.R | 2 +- README.Rmd | 5 +- README.md | 5 +- cran-comments.md | 153 ++++++----------------------------- man/pdf_form_fields.Rd | 2 +- man/pdf_page_bounding_box.Rd | 2 +- 7 files changed, 34 insertions(+), 137 deletions(-) diff --git a/R/api_completion.R b/R/api_completion.R index ecd9804..b161616 100644 --- a/R/api_completion.R +++ b/R/api_completion.R @@ -105,7 +105,7 @@ pdf_page_has_transparency <- function(page) { cpp_page_has_transparency(page$ptr) } -#' Page bounding box (cropbox ∩ mediabox) +#' Page bounding box (cropbox intersect mediabox) #' #' Wraps `FPDF_GetPageBoundingBox` — returns the rectangle that #' encloses the visible portion of `page` after intersecting the diff --git a/R/form_fields.R b/R/form_fields.R index 879c0f6..dd0e3b8 100644 --- a/R/form_fields.R +++ b/R/form_fields.R @@ -80,7 +80,7 @@ form_field_flag_decode <- function(flags, bit) { #' `TRUE` / `FALSE` for `checkbox` / `radiobutton` fields, #' `NA` for every other field type. 
#' * `control_count` integer - total number of widgets in this -#' field's control group (≥ 1; `> 1` for radio button groups +#' field's control group (`>= 1`; `> 1` for radio button groups #' with multiple physical widgets). `NA` if PDFium reports #' failure. #' * `control_index` integer - 0-based position of this row's diff --git a/README.Rmd b/README.Rmd index a49b6bb..5aae7de 100644 --- a/README.Rmd +++ b/README.Rmd @@ -105,5 +105,6 @@ package = "pdfium")`, etc.) and on the `pdfium` is MIT-licensed. The bundled `libpdfium` binary is BSD-3-Clause and is *not* distributed in the source tarball — see -[`LICENSE.md`](LICENSE.md) and -[`dev/decisions/ADR-003-binary-distribution.md`](dev/decisions/ADR-003-binary-distribution.md). +[`LICENSE.md`](https://github.com/humanpred/rpdfium/blob/main/LICENSE.md) +and +[`dev/decisions/ADR-003-binary-distribution.md`](https://github.com/humanpred/rpdfium/blob/main/dev/decisions/ADR-003-binary-distribution.md). diff --git a/README.md b/README.md index fd28b09..e648a64 100644 --- a/README.md +++ b/README.md @@ -98,5 +98,6 @@ More examples ship in the vignettes `pdfium` is MIT-licensed. The bundled `libpdfium` binary is BSD-3-Clause and is *not* distributed in the source tarball — see -[`LICENSE.md`](LICENSE.md) and -[`dev/decisions/ADR-003-binary-distribution.md`](dev/decisions/ADR-003-binary-distribution.md). +[`LICENSE.md`](https://github.com/humanpred/rpdfium/blob/main/LICENSE.md) +and +[`dev/decisions/ADR-003-binary-distribution.md`](https://github.com/humanpred/rpdfium/blob/main/dev/decisions/ADR-003-binary-distribution.md). diff --git a/cran-comments.md b/cran-comments.md index 84b0741..8323387 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -2,132 +2,27 @@ ## Summary -`pdfium` is a new R package providing idiomatic bindings to Google's -PDFium PDF engine via Rcpp. 
It complements `pdftools` (Poppler) and -`qpdf` (QPDF), filling two gaps no other CRAN package fills: - -* **Vector-path geometry on read** — segment kinds, control points, - stroke / fill style, transformation matrices, clip paths, blend - modes — alongside text, fonts, images, annotations, form fields, - attachments, signatures, structure tree, bookmarks, named - destinations, and rendering. `pdf_extract_paths()` returns a - tibble matching the schema kmextract's pypdfium2 backend ships - today. -* **A focused mutation surface** opt-in via `readwrite = TRUE` on - `pdf_doc_open()` (or `pdf_doc_new()` for fresh documents): - structural mutation (page rotate / delete / reorder / merge / - box / language), page-object styling setters, path-geometry - rebuild, page-object creation (paths, rectangles, text, JPEG - images), annotation authoring (14 supported subtypes), form - filling + flattening, attachment authoring, plus standard-font - and custom-font (TrueType / Type1) embedding. `pdf_save()` - writes atomically. - -Both halves are documented in pkgdown -() and exercised at 100% R -coverage in CI. - -## Test environments - -The R-CMD-check matrix in `.github/workflows/R-CMD-check.yaml` -covers: - -* Ubuntu 24.04, R-release / R-devel / R-oldrel-1 -* macOS-latest, R-release -* Windows-latest, R-release - -`R CMD check --as-cran` locally on Ubuntu 24.04 with R 4.6.0: -0 ERRORs, 0 WARNINGs when `checkbashisms` is installed, 1 NOTE -(detailed below). The cross-platform CI matrix -() -is green on every cell on the head of `main`. - -## Expected NOTEs - -* **"Compilation used the following non-portable flag(s): - '-mno-omit-leaf-frame-pointer'"** — inherited from the Debian / - Ubuntu `r-base` package's default `CXX17FLAGS`. The pdfium - package itself does not pass this flag in its `Makevars`; it - appears only when R itself was built on Debian-family systems - with that flag set in `etc/Makeconf`. No NOTE seen on - macOS-latest or Windows-latest CI cells. 
- -* **"Installed package size … Mb"** — *may appear* on systems - where `inst/lib/libpdfium` ends up at 10–15 MB (the bundled - libpdfium shared library). We download it at install time - rather than shipping it in the source tarball, so the tarball - itself is well under CRAN's 5 MB limit (~1 MB). - -* **"GNU make is a SystemRequirements"** — *may appear* on - platforms where the `configure` script triggers a GNU-make - feature. The package declares `SystemRequirements: C++17, - libpdfium (downloaded automatically at install time)` to make - this explicit; the `configure` script downloads the matching - bblanchon binary on demand and `cleanup` removes intermediate - artefacts. - -## Network access at install time - -The `configure` (POSIX) and `configure.win` (Windows) scripts -fetch the bblanchon `libpdfium` binary on first install. The -script: - -* Honors `CRAN_PDFIUM_OFFLINE=1` as a hard opt-out for the CRAN - build farm. -* Falls back to a prepopulated `inst/pdfium-binaries/` directory - when one is present. -* Errors with a clear message — and a `configure` exit code that - surfaces in `install.packages()` — when the network is - unavailable and no fallback is present. - -The pinned release URL and SHA-256 live in -`tools/pdfium-version.txt`; any change to the pin requires a new -ADR entry under `dev/decisions/`. The download URL points at -GitHub releases (`https://github.com/bblanchon/pdfium-binaries/...`) -which is in CRAN's allowlist of acceptable fetch sources for -`arrow`, `duckdb`, and other binary-heavy packages. - -No network access is required to run the package after install. -Tests use only the bundled fixtures under `inst/extdata/fixtures/`; -examples either use those fixtures or are wrapped in -`if (nzchar(fixture)) { ... }` so they no-op when the package is -not yet installed. - -## Reverse dependencies - -This is a first submission; there are no reverse dependencies -yet. 
- -## Examples runtime - -Every documented function has a runnable example. The longest -single example runs in under 200 ms on a 2024 Linux laptop; -the full `R CMD check` example pass completes in well under -60 seconds. No example uses `\dontrun{}`; all use -`if (nzchar(system.file(...))) { ... }` to no-op when the -fixture is missing. - -## CRAN policy compliance checklist - -* [x] No writes outside `tempdir()` and the package install - directory. -* [x] No network access during `R CMD check` (download is at - install time only; tests use bundled fixtures). -* [x] No `\dontrun{}` examples. -* [x] Examples runtime < 5 s each; full pass < 60 s. -* [x] No `<<-` writes to `.GlobalEnv` or anywhere outside the - package namespace. -* [x] No interactive prompts at install or load time. -* [x] All Suggests packages are on CRAN and used via - `requireNamespace()` / `skip_if_not_installed()` where - appropriate. -* [x] Mutators require an explicit `readwrite = TRUE` opt-in on - `pdf_doc_open()` so accidental edits inside a read-only - pipeline raise a clear error rather than silently mutating - the document. - -## Licence - -Package code: MIT (with file LICENSE). -Bundled `libpdfium` binary: BSD-3-Clause. The combined provenance -and per-file attribution live in `LICENSE.md`. +`pdfium` provides idiomatic R bindings to Google's PDFium PDF +engine via Rcpp. It exposes more of a PDF's internals — vector- +path geometry, annotations, form fields, attachments, +signatures, structure tree, named destinations, viewer +preferences, and a focused mutation surface — than any other R +package on CRAN today. + +## NOTEs we cannot eliminate + +* **"Possibly misspelled words in DESCRIPTION: PDFium"** — + "PDFium" is the proper name of the upstream Google library + this package wraps. It is already listed in `inst/WORDLIST`. + +* **"Found '\_exit' / 'abort' / 'exit'" in `pdfium.dll`** — the + flagged DLL is the package's own compiled Rcpp library. 
Our + C/C++ source contains zero calls to these functions (all + error paths use `Rcpp::stop()`). The symbols are linked in by + the C/C++ runtime that Rtools statically attaches to every + shared library (`libgcc` / `libstdc++` reference `abort()` + from the default terminate handler) and via the import table + for the upstream `libpdfium.dll` we link against. CRAN's own + NOTE acknowledges this case: *"The detected symbols are + linked into the code but might come from libraries and not + actually be called."* diff --git a/man/pdf_form_fields.Rd b/man/pdf_form_fields.Rd index ab52002..b0ac895 100644 --- a/man/pdf_form_fields.Rd +++ b/man/pdf_form_fields.Rd @@ -31,7 +31,7 @@ decoded universal flag bits (bits 1, 2, 3) for convenience. \code{TRUE} / \code{FALSE} for \code{checkbox} / \code{radiobutton} fields, \code{NA} for every other field type. \item \code{control_count} integer - total number of widgets in this -field's control group (≥ 1; \verb{> 1} for radio button groups +field's control group (\verb{>= 1}; \verb{> 1} for radio button groups with multiple physical widgets). \code{NA} if PDFium reports failure. 
\item \code{control_index} integer - 0-based position of this row's diff --git a/man/pdf_page_bounding_box.Rd b/man/pdf_page_bounding_box.Rd index cfdbdbc..bff83ca 100644 --- a/man/pdf_page_bounding_box.Rd +++ b/man/pdf_page_bounding_box.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/api_completion.R \name{pdf_page_bounding_box} \alias{pdf_page_bounding_box} -\title{Page bounding box (cropbox ∩ mediabox)} +\title{Page bounding box (cropbox intersect mediabox)} \usage{ pdf_page_bounding_box(page) } From 12e671b07e67634a438ec0199408a3dc910ab14e Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Sun, 31 May 2026 17:43:15 -0400 Subject: [PATCH 05/10] As submitted to CRAN --- CRAN-SUBMISSION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index c95df0a..844f115 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ Version: 0.1.0 -Date: 2026-05-31 20:36:14 UTC -SHA: b97c08d871b8a48a6742ad336be7865723692c6f +Date: 2026-05-31 21:42:49 UTC +SHA: e121bc96335cd429524d5f53d1bf584856a9ef1b From 5af2a5fb8f2caa0cbfce4016aa3f03ac7632de49 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Mon, 29 Jun 2026 08:45:54 -0400 Subject: [PATCH 06/10] docs(cran): single-quote 'PDFium' in DESCRIPTION title and description CRAN reviewer asked: "Please single quote software names in both Title and Description fields of the DESCRIPTION file such as 'PDFium'." 
Co-Authored-By: Claude Opus 4.7 (1M context) --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 954138e..b1aa410 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: pdfium -Title: Idiomatic R Bindings to the PDFium PDF Engine +Title: Idiomatic R Bindings to the 'PDFium' PDF Engine Version: 0.1.0 Authors@R: c( person("Bill", "Denney", , "wdenney@humanpredictions.com", @@ -9,7 +9,7 @@ Authors@R: c( comment = "Authors of bundled PDFium binaries (BSD-3-Clause)") ) Description: Read PDF documents at the level of pages, page objects, and path - geometry using Google's PDFium engine. Surfaces path segments, stroke and + geometry using Google's 'PDFium' engine. Surfaces path segments, stroke and fill style, transformation matrices, text positions and content, font metadata, image metadata, and page rendering. Complements 'pdftools' and 'qpdf' by exposing vector-path information no other R package surfaces. From 63d4f4c274901b2573699ea27d5d8f093f04242b Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Mon, 29 Jun 2026 09:00:56 -0400 Subject: [PATCH 07/10] build(cran): prefer existing libpdfium install before downloading CRAN reviewer asked: "Please only download if no system installation is found by configure." POSIX `configure` now selects libpdfium in this order: 1. PDFIUM_HOME env var pointing at an existing install (include/fpdfview.h + lib/libpdfium.{so,dylib}) 2. pkg-config --exists libpdfium 3. Standard prefixes: /usr/local, /usr, /opt/homebrew, /opt/local A system match writes src/Makevars pointing at the system paths without an RPATH (ld.so.cache / macOS fallback paths resolve libpdfium at load time). Only if nothing is found do we fall back to tools/download-pdfium.R. 
`configure.win` honors PDFIUM_HOME when it contains a pre-built Mingw-compatible distribution (include/, lib/libpdfium.dll.a, bin/libpdfium.dll); the files are staged into inst/ so the rest of the build pipeline is unchanged. Windows has no canonical system install location, so we do not guess. Pin shell scripts (configure, configure.win, *.sh) to LF in .gitattributes so a Windows checkout with core.autocrlf=true does not break sh on POSIX. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitattributes | 7 +++ configure | 165 ++++++++++++++++++++++++++++++++++++++----------- configure.win | 45 +++++++++++--- 3 files changed, 173 insertions(+), 44 deletions(-) diff --git a/.gitattributes b/.gitattributes index 45461d4..13ac0ac 100644 --- a/.gitattributes +++ b/.gitattributes @@ -14,3 +14,10 @@ # Upstream patches must stay byte-identical to what `git format-patch` # / `git apply` expect — no autocrlf. *.patch binary + +# Shell scripts that R CMD INSTALL executes must keep LF endings even +# on Windows checkouts, or `sh` rejects them with `\r: command not +# found`. This includes the package's configure scripts. +configure text eol=lf +configure.win text eol=lf +*.sh text eol=lf diff --git a/configure b/configure index c93e825..8009e52 100755 --- a/configure +++ b/configure @@ -1,22 +1,32 @@ #!/usr/bin/env sh # pdfium R package — POSIX configure script. # -# Run by `R CMD INSTALL`. Downloads the pinned bblanchon PDFium binary for -# the current platform, copies headers / libraries into inst/include and -# inst/lib, then writes src/Makevars with the resolved paths and a -# relocatable RPATH so the installed `pdfium.so` finds `libpdfium.so` at -# load time without LD_LIBRARY_PATH. +# Selects a libpdfium to build against in this order: +# 1. PDFIUM_HOME env var pointing at an existing install +# (with include/fpdfview.h and lib/libpdfium.{so,dylib}) +# 2. pkg-config --exists libpdfium +# 3. Standard system prefixes: /usr/local, /usr, /opt/homebrew, +# /opt/local +# 4. 
Fall back to downloading the pinned bblanchon binary via +# tools/download-pdfium.R (the existing behavior). +# +# When a system install is found, the generated src/Makevars points +# directly at the system paths without an RPATH — the dynamic +# loader's standard search path (ld.so.cache on Linux, the macOS +# fallback paths) resolves libpdfium at load time. +# +# When the binary is downloaded, the existing relocatable RPATH +# (`$ORIGIN/../lib` on Linux, `@loader_path/../lib` on macOS) lets +# the installed pdfium.so find libpdfium inside the package's +# inst/lib/ directory. # # Honors: # R_HOME, R picked up automatically from R CMD INSTALL -# PDFIUM_OFFLINE=1 skip download; require vendored archive under -# inst/pdfium-binaries/ -# PDFIUM_BINARY_URL override download URL (mirrors, internal hosts) +# PDFIUM_HOME system-install root (skip download) +# PDFIUM_OFFLINE=1 skip download; require vendored archive +# under inst/pdfium-binaries/ +# PDFIUM_BINARY_URL override download URL (mirrors) # PDFIUM_CACHE_DIR cross-build archive cache directory -# -# CRAN-builder note: CRAN allows configure-time downloads (precedent: arrow). -# Set PDFIUM_OFFLINE=1 and ship the archive inside the source tarball under -# inst/pdfium-binaries/ if that ever changes. set -eu @@ -31,35 +41,116 @@ if [ ! -x "$RSCRIPT" ]; then fi fi -echo "[pdfium configure] Resolving binary for $(uname -sm)" -RESOLVED="$("$RSCRIPT" --vanilla "$PKG_ROOT/tools/download-pdfium.R" "$PKG_ROOT")" -PDFIUM_INCLUDE="$(printf '%s\n' "$RESOLVED" | sed -n '1p')" -PDFIUM_LIB="$(printf '%s\n' "$RESOLVED" | sed -n '2p')" +PDFIUM_INCLUDE="" +PDFIUM_LIB="" +USE_SYSTEM="" + +# Helper: given a candidate include dir and lib dir, accept them if +# both libpdfium and the public header live there. +check_pair() { + inc="$1" + lib="$2" + if [ ! 
-f "$inc/fpdfview.h" ]; then + return 1 + fi + for ext in so dylib; do + if [ -f "$lib/libpdfium.$ext" ]; then + PDFIUM_INCLUDE="$inc" + PDFIUM_LIB="$lib" + return 0 + fi + done + return 1 +} -if [ -z "$PDFIUM_INCLUDE" ] || [ -z "$PDFIUM_LIB" ]; then - echo "ERROR: tools/download-pdfium.R did not return include / lib paths" >&2 - exit 1 +# 1. PDFIUM_HOME. +if [ -n "${PDFIUM_HOME:-}" ]; then + for inc in "$PDFIUM_HOME/include" "$PDFIUM_HOME/include/pdfium"; do + for lib in "$PDFIUM_HOME/lib" "$PDFIUM_HOME/lib64"; do + if check_pair "$inc" "$lib"; then + USE_SYSTEM=1 + break 2 + fi + done + done + if [ -z "$USE_SYSTEM" ]; then + echo "[pdfium configure] PDFIUM_HOME=$PDFIUM_HOME set but no usable" \ + "install found there; trying other locations" >&2 + fi +fi + +# 2. pkg-config. +if [ -z "$USE_SYSTEM" ] && command -v pkg-config >/dev/null 2>&1; then + if pkg-config --exists libpdfium 2>/dev/null; then + pc_inc="$(pkg-config --variable=includedir libpdfium 2>/dev/null || true)" + pc_lib="$(pkg-config --variable=libdir libpdfium 2>/dev/null || true)" + if [ -n "$pc_inc" ] && [ -n "$pc_lib" ]; then + for inc_cand in "$pc_inc" "$pc_inc/pdfium"; do + if check_pair "$inc_cand" "$pc_lib"; then + USE_SYSTEM=1 + break + fi + done + fi + fi fi -# RPATH so the installed pdfium.so finds libpdfium relative to itself. -# After install, libpdfium.{so,dylib} sits in /lib/ (from inst/lib) -# and pdfium.so sits in /libs//, so RPATH points one level -# up plus into lib/. +# 3. Standard prefixes. +if [ -z "$USE_SYSTEM" ]; then + for prefix in /usr/local /usr /opt/homebrew /opt/local; do + for inc in "$prefix/include" "$prefix/include/pdfium"; do + for lib in "$prefix/lib" "$prefix/lib64" \ + "$prefix/lib/x86_64-linux-gnu" \ + "$prefix/lib/aarch64-linux-gnu"; do + if check_pair "$inc" "$lib"; then + USE_SYSTEM=1 + break 3 + fi + done + done + done +fi + +# 4. Fall back to download. 
+if [ -z "$USE_SYSTEM" ]; then + echo "[pdfium configure] No system libpdfium found; downloading binary for $(uname -sm)" + RESOLVED="$("$RSCRIPT" --vanilla "$PKG_ROOT/tools/download-pdfium.R" "$PKG_ROOT")" + PDFIUM_INCLUDE="$(printf '%s\n' "$RESOLVED" | sed -n '1p')" + PDFIUM_LIB="$(printf '%s\n' "$RESOLVED" | sed -n '2p')" + if [ -z "$PDFIUM_INCLUDE" ] || [ -z "$PDFIUM_LIB" ]; then + echo "ERROR: tools/download-pdfium.R did not return include / lib paths" >&2 + exit 1 + fi +else + echo "[pdfium configure] Using system libpdfium (include=$PDFIUM_INCLUDE, lib=$PDFIUM_LIB)" +fi + +# RPATH so the installed pdfium.so finds the bundled libpdfium +# relative to itself. After install, libpdfium.{so,dylib} sits in +# /lib/ (from inst/lib) and pdfium.so sits in /libs//, +# so RPATH points one level up plus into lib/. # -# The dollar sign survives Make (which sees $$ORIGIN and produces $ORIGIN) -# and the recipe shell (which receives a single-quoted token from the -# heredoc, so $ORIGIN is not parameter-expanded). -case "$(uname -s)" in - Darwin) - RPATH_FLAG="-Wl,-rpath,'@loader_path/../lib'" - ;; - Linux) - RPATH_FLAG='-Wl,-rpath,'"'"'$$ORIGIN/../lib'"'" - ;; - *) - RPATH_FLAG="" - ;; -esac +# The dollar sign survives Make (which sees $$ORIGIN and produces +# $ORIGIN) and the recipe shell (which receives a single-quoted token +# from the heredoc, so $ORIGIN is not parameter-expanded). +# +# When using a system install, the library is on the dynamic loader's +# standard search path and no RPATH is needed. 
+if [ -n "$USE_SYSTEM" ]; then + RPATH_FLAG="" +else + case "$(uname -s)" in + Darwin) + RPATH_FLAG="-Wl,-rpath,'@loader_path/../lib'" + ;; + Linux) + RPATH_FLAG='-Wl,-rpath,'"'"'$$ORIGIN/../lib'"'" + ;; + *) + RPATH_FLAG="" + ;; + esac +fi cat > "$PKG_ROOT/src/Makevars" <&2 + fi +fi + +if [ -n "$USE_SYSTEM" ]; then + echo "[pdfium configure.win] Staging libpdfium from PDFIUM_HOME=$PDFIUM_HOME" + mkdir -p "$PKG_ROOT/inst/include" "$PKG_ROOT/inst/lib" "$PKG_ROOT/inst/bin" + cp -R "$PDFIUM_HOME/include/." "$PKG_ROOT/inst/include/" + cp "$PDFIUM_HOME/lib/libpdfium.dll.a" "$PKG_ROOT/inst/lib/" + cp "$PDFIUM_HOME/bin/libpdfium.dll" "$PKG_ROOT/inst/bin/" +else + echo "[pdfium configure.win] No system libpdfium found; downloading binary for Windows" + "$RSCRIPT" --vanilla "$PKG_ROOT/tools/download-pdfium.R" "$PKG_ROOT" +fi echo "[pdfium configure.win] inst/{include,lib,bin} populated." From 6b30a7bc2a69f7c1bb84eec14580b5b456798251 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Mon, 29 Jun 2026 09:14:15 -0400 Subject: [PATCH 08/10] build(cran): strip dead code; document residual exit/abort symbols CRAN reviewer asked: "If the exit/abort calls are not needed, please omit them before compiling the dll." Added `-ffunction-sections -fdata-sections -Wl,--gc-sections` to src/Makevars.win so the linker discards unreferenced sections from the package's own compilation units. Verified the package still builds and loads after the change. The residual `_exit` / `abort` / `exit` imports cannot be removed at our layer. `objdump -p src/pdfium.dll` shows they are imports of `api-ms-win-crt-runtime-l1-1-0.dll` (the Windows Universal CRT), pulled in by Rtools' own static MinGW startup libraries: libmingw32.a references abort libmingwex.a references _exit libucrt.a references _exit Our own C/C++ source contains zero direct calls to these symbols (all error paths route through Rcpp::stop()); removing them would require rebuilding Rtools/MinGW itself.
cran-comments.md spells this out with the supporting evidence for the next reviewer. Co-Authored-By: Claude Opus 4.7 (1M context) --- cran-comments.md | 53 ++++++++++++++++++++++++++++++++++-------------- src/Makevars.win | 3 ++- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/cran-comments.md b/cran-comments.md index 8323387..b2b4c0f 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -2,27 +2,50 @@ ## Summary -`pdfium` provides idiomatic R bindings to Google's PDFium PDF +`pdfium` provides idiomatic R bindings to Google's 'PDFium' PDF engine via Rcpp. It exposes more of a PDF's internals — vector- path geometry, annotations, form fields, attachments, signatures, structure tree, named destinations, viewer preferences, and a focused mutation surface — than any other R package on CRAN today. -## NOTEs we cannot eliminate +## Resubmission — addressing prior reviewer feedback -* **"Possibly misspelled words in DESCRIPTION: PDFium"** — - "PDFium" is the proper name of the upstream Google library - this package wraps. It is already listed in `inst/WORDLIST`. +* **Single-quoted 'PDFium'** in the Title and Description + fields of DESCRIPTION. + +* **`configure` prefers an existing system libpdfium before + downloading.** Selection order, on POSIX: + + 1. PDFIUM_HOME env var pointing at a usable install + 2. pkg-config --exists libpdfium + 3. /usr/local, /usr, /opt/homebrew, /opt/local + + Only if none of those resolve does it fall back to the + bblanchon binary download. `configure.win` honors + PDFIUM_HOME similarly (Windows has no canonical system + install location, so it does not guess). + +## NOTE we cannot eliminate * **"Found '\_exit' / 'abort' / 'exit'" in `pdfium.dll`** — the - flagged DLL is the package's own compiled Rcpp library. Our - C/C++ source contains zero calls to these functions (all - error paths use `Rcpp::stop()`). 
The symbols are linked in by - the C/C++ runtime that Rtools statically attaches to every - shared library (`libgcc` / `libstdc++` reference `abort()` - from the default terminate handler) and via the import table - for the upstream `libpdfium.dll` we link against. CRAN's own - NOTE acknowledges this case: *"The detected symbols are - linked into the code but might come from libraries and not - actually be called."* + flagged DLL is the package's own compiled Rcpp library, not + the upstream binary. Our C/C++ source contains zero direct + calls to these functions (all error paths use `Rcpp::stop()`). + They are imports of `api-ms-win-crt-runtime-l1-1-0.dll` (the + Windows Universal CRT) pulled in by the MinGW-w64 startup + machinery that Rtools links into every shared object: + + libmingw32.a references abort (CRT init, + __cxa_terminate handler) + libmingwex.a references _exit + libucrt.a references _exit + + We added `-ffunction-sections -fdata-sections + -Wl,--gc-sections` to strip dead code from our own + compilation units, but the residual references survive + because they are needed by Rtools' own static runtime, not + by our code. Removing them would require rebuilding + Rtools/MinGW itself. CRAN's own NOTE text acknowledges this + case: *"The detected symbols are linked into the code but + might come from libraries and not actually be called."* diff --git a/src/Makevars.win b/src/Makevars.win index fbae01a..af40ca9 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -21,6 +21,7 @@ # automatically. 
PKG_CPPFLAGS = -I../inst/include -DR_NO_REMAP -PKG_LIBS = -L../inst/lib -lpdfium +PKG_CXXFLAGS = -ffunction-sections -fdata-sections +PKG_LIBS = -L../inst/lib -lpdfium -Wl,--gc-sections CXX_STD = CXX17 From 8747ac436890d0a8dfbc531046e283f40c38bdaf Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Mon, 29 Jun 2026 11:10:53 -0400 Subject: [PATCH 09/10] fix(cran): install symbols.rds so check_compiled_code sees clean .o files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous explanation in cran-comments.md was wrong. Empirical testing shows: * Our production .o files (R CMD INSTALL with -DNDEBUG, no -UNDEBUG) contain zero references to _exit / abort / exit. * Every Rtools/MinGW-built Windows DLL on this system imports _exit / abort / exit from the Universal CRT (verified across Rcpp, tibble, vctrs, cli, stringi, rlang, openssl, curl, ps, fs, digest). Those imports come from libgcc / libstdc++ / libmingw32.a runtime startup machinery; they are universal, not specific to this package. The real reason CRAN NOTEd us and not those other packages: tools::check_compiled_code() reads libs/<arch>/symbols.rds when _R_SHLIB_BUILD_OBJECTS_SYMBOL_TABLES_=TRUE (which CRAN sets) and uses it to filter the DLL's symbols against what was actually in the .o files. Without symbols.rds, it falls back to scanning the DLL alone and trips on the runtime imports. R CMD INSTALL writes src/symbols.rds when that env var is set, and its default install logic copies it into libs/<arch>/. This package ships an install.libs.R that replaces that default logic (to handle the bundled libpdfium copy) and did not propagate symbols.rds. Now it does. Verified: with the fix, tools:::check_compiled_code() returns NULL on the installed package both with and without _R_SHLIB_BUILD_OBJECTS_SYMBOL_TABLES_=TRUE.
Also reverted the -ffunction-sections -fdata-sections -Wl,--gc-sections flags I added speculatively in the previous commit — they did not remove the runtime imports (the real fix is symbols.rds) and they actually grow pdfium.dll by ~27 KB on this toolchain because the per-section header overhead exceeds the dead-code savings. cran-comments.md is updated to reflect the correct diagnosis. Co-Authored-By: Claude Opus 4.7 (1M context) --- cran-comments.md | 49 +++++++++++++++++++++++----------------------- src/Makevars.win | 3 +-- src/install.libs.R | 16 +++++++++++++++ 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/cran-comments.md b/cran-comments.md index b2b4c0f..ecaed96 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -15,7 +15,7 @@ package on CRAN today. fields of DESCRIPTION. * **`configure` prefers an existing system libpdfium before - downloading.** Selection order, on POSIX: + downloading.** On POSIX, the selection order is: 1. PDFIUM_HOME env var pointing at a usable install 2. pkg-config --exists libpdfium @@ -26,26 +26,27 @@ package on CRAN today. PDFIUM_HOME similarly (Windows has no canonical system install location, so it does not guess). -## NOTE we cannot eliminate - -* **"Found '\_exit' / 'abort' / 'exit'" in `pdfium.dll`** — the - flagged DLL is the package's own compiled Rcpp library, not - the upstream binary. Our C/C++ source contains zero direct - calls to these functions (all error paths use `Rcpp::stop()`). 
- They are imports of `api-ms-win-crt-runtime-l1-1-0.dll` (the - Windows Universal CRT) pulled in by the MinGW-w64 startup - machinery that Rtools links into every shared object: - - libmingw32.a references abort (CRT init, - __cxa_terminate handler) - libmingwex.a references _exit - libucrt.a references _exit - - We added `-ffunction-sections -fdata-sections - -Wl,--gc-sections` to strip dead code from our own - compilation units, but the residual references survive - because they are needed by Rtools' own static runtime, not - by our code. Removing them would require rebuilding - Rtools/MinGW itself. CRAN's own NOTE text acknowledges this - case: *"The detected symbols are linked into the code but - might come from libraries and not actually be called."* +* **`_exit` / `abort` / `exit` symbols** — root cause + identified and fixed. `tools::check_compiled_code()` on + Windows reads `libs/<arch>/symbols.rds` (an R-generated + per-`.o`-file symbol table) when `_R_SHLIB_BUILD_OBJECTS_SYMBOL_TABLES_=TRUE` + (which CRAN sets). Without that file installed alongside + the package's DLL, the check falls back to scanning the + DLL's import table — which on every Rtools/MinGW-built + Windows shared library imports `_exit`/`abort`/`exit` from + the Universal CRT (libgcc / libstdc++ / libmingw32.a all + reference them from their runtime/terminate machinery, + whether or not the user's code calls them). The package's + own compiled `.o` files contain zero references to these + symbols (verified with `nm --undefined-only` on a + production build). + + This package ships an `install.libs.R` script that replaces + R's default install logic for `src/*.so/.dll`. The previous + version did not propagate `src/symbols.rds` into the + installed `libs/<arch>/`. The fix in this submission copies + `symbols.rds` when present, restoring the behaviour R + performs by default for packages without a custom + `install.libs.R`.
With `symbols.rds` in place, + `tools::check_compiled_code()` returns no findings on the + installed package. diff --git a/src/Makevars.win b/src/Makevars.win index af40ca9..fbae01a 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -21,7 +21,6 @@ # automatically. PKG_CPPFLAGS = -I../inst/include -DR_NO_REMAP -PKG_CXXFLAGS = -ffunction-sections -fdata-sections -PKG_LIBS = -L../inst/lib -lpdfium -Wl,--gc-sections +PKG_LIBS = -L../inst/lib -lpdfium CXX_STD = CXX17 diff --git a/src/install.libs.R b/src/install.libs.R index 229a126..cd3cb3a 100644 --- a/src/install.libs.R +++ b/src/install.libs.R @@ -44,6 +44,22 @@ local({ shlib_src, dest)) } + # 1b. Copy the per-object symbol table that R CMD INSTALL writes + # when _R_SHLIB_BUILD_OBJECTS_SYMBOL_TABLES_=TRUE (the setting CRAN + # uses). Without symbols.rds in the installed libs/<arch>/, the + # check_compiled_code() pass on Windows can't see what was in our + # .o files and falls back to scanning the .dll's import table — + # which always contains _exit/abort/exit from the MinGW static + # runtime, producing a spurious NOTE. Default R install logic + # would copy this automatically; we replicate it here because + # this script replaces that default. + for (sym_src in c("symbols.rds", file.path(paste0("src", R_ARCH), "symbols.rds"))) { + if (file.exists(sym_src)) { + file.copy(sym_src, file.path(dest, "symbols.rds"), overwrite = TRUE) + break + } + } + # 2. Windows: copy bblanchon's libpdfium.dll next to our DLL. if (.Platform$OS.type == "windows") { pkg_root <- R_PACKAGE_SOURCE From 2545f9f100a8b23f41ed6693ef24cf45a83fe00e Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Mon, 29 Jun 2026 11:12:25 -0400 Subject: [PATCH 10/10] docs: describe libpdfium selection order (PDFIUM_HOME, pkg-config, ...) README's Installation section and ?pdfium-package now spell out the order configure uses to pick a libpdfium: 1. PDFIUM_HOME env var pointing at an existing install 2. pkg-config --exists libpdfium (POSIX only) 3.
/usr/local, /usr, /opt/homebrew, /opt/local (POSIX only) 4. Download from bblanchon/pdfium-binaries (offline opt-out via PDFIUM_OFFLINE=1 with a vendored archive) The README lists the required file layout under $PDFIUM_HOME per platform so users with a hand-built or vendored libpdfium can wire it in without guessing. Co-Authored-By: Claude Opus 4.7 (1M context) --- R/pdfium-package.R | 20 +++++-- README.Rmd | 47 +++++++++++++--- README.md | 128 ++++++++++++++++++++++++++---------------- man/pdfium-package.Rd | 21 +++++-- 4 files changed, 154 insertions(+), 62 deletions(-) diff --git a/R/pdfium-package.R b/R/pdfium-package.R index 03249c9..e5eb467 100644 --- a/R/pdfium-package.R +++ b/R/pdfium-package.R @@ -13,10 +13,22 @@ #' #' @section Binary distribution: #' -#' The underlying `libpdfium` shared library is downloaded from -#' [bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries) -#' the first time the package is installed. The pinned version lives in -#' `tools/pdfium-version.txt`. +#' At install time, the `configure` script picks a `libpdfium` to +#' build against, in this order: +#' +#' 1. The `PDFIUM_HOME` environment variable, if it points at a +#' directory containing `include/fpdfview.h` and a +#' `libpdfium` shared library (`lib/libpdfium.{so,dylib}` on +#' POSIX, or `lib/libpdfium.dll.a` + `bin/libpdfium.dll` on +#' Windows). +#' 2. `pkg-config --exists libpdfium` (POSIX only). +#' 3. Standard system prefixes: `/usr/local`, `/usr`, +#' `/opt/homebrew`, `/opt/local` (POSIX only). +#' 4. Download from +#' [bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries). +#' The pinned release lives in `tools/pdfium-version.txt`. +#' Set `PDFIUM_OFFLINE=1` and stage the tarball under +#' `inst/pdfium-binaries/` for offline installs. #' #' @keywords internal #' @name pdfium-package diff --git a/README.Rmd b/README.Rmd index 5aae7de..59a4ee1 100644 --- a/README.Rmd +++ b/README.Rmd @@ -70,13 +70,6 @@ under `dev/decisions/`. 
## Installation -`pdfium` downloads its `libpdfium` binary from -[bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries) -at install time. The pinned version lives in -`tools/pdfium-version.txt`. If your install runs without internet -access, set `PDFIUM_OFFLINE=1` and place the matching tarball under -`inst/pdfium-binaries/` before installing. - ```r # Release version (once on CRAN): install.packages("pdfium") @@ -85,6 +78,46 @@ install.packages("pdfium") remotes::install_github("humanpred/rpdfium") ``` +### Where the `libpdfium` binary comes from + +At install time, the `configure` script picks a `libpdfium` to +build against, in this order: + +1. **`PDFIUM_HOME`** — if this environment variable is set and + points at an existing install, that install is used. The + directory must contain headers and the shared library in the + conventional layout: + + | Platform | Required files under `$PDFIUM_HOME` | + |------------|----------------------------------------------------------------------| + | Linux | `include/fpdfview.h` and `lib/libpdfium.so` (or `lib64/`) | + | macOS | `include/fpdfview.h` and `lib/libpdfium.dylib` | + | Windows | `include/fpdfview.h`, `lib/libpdfium.dll.a`, `bin/libpdfium.dll` | + + Useful when you have a hand-built PDFium, a vendored copy, + or a CI artefact you want to pin against. + +2. **`pkg-config --exists libpdfium`** *(POSIX only)* — if a + `libpdfium.pc` is on the `pkg-config` search path, the + reported `includedir` / `libdir` are used. + +3. **Standard system prefixes** *(POSIX only)* — `/usr/local`, + `/usr`, `/opt/homebrew`, `/opt/local`. The first one + containing both `include/fpdfview.h` and a `libpdfium` + shared library wins. + +4. **Download from + [bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries)** + — the pinned release lives in `tools/pdfium-version.txt`. 
If + your install runs without internet access, set + `PDFIUM_OFFLINE=1` and place the matching tarball under + `inst/pdfium-binaries/` before installing. + +When a system install is found, no download happens and no +`libpdfium` is bundled into the installed package — your +existing copy resolves at load time via the platform's normal +shared-library search path. + ## Example ```{r example, eval = FALSE} diff --git a/README.md b/README.md index e648a64..a77cdca 100644 --- a/README.md +++ b/README.md @@ -1,54 +1,54 @@ +--- +output: github_document +--- + + # pdfium - [![R-CMD-check](https://github.com/humanpred/rpdfium/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/humanpred/rpdfium/actions/workflows/R-CMD-check.yaml) -[![Codecov test -coverage](https://codecov.io/gh/humanpred/rpdfium/branch/main/graph/badge.svg)](https://app.codecov.io/gh/humanpred/rpdfium) -[![Lifecycle: -experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) -[![CRAN -status](https://www.r-pkg.org/badges/version/pdfium)](https://CRAN.R-project.org/package=pdfium) -[![Codecov test -coverage](https://codecov.io/gh/humanpred/rpdfium/graph/badge.svg)](https://app.codecov.io/gh/humanpred/rpdfium) +[![Codecov test coverage](https://codecov.io/gh/humanpred/rpdfium/branch/main/graph/badge.svg)](https://app.codecov.io/gh/humanpred/rpdfium) +[![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) +[![CRAN status](https://www.r-pkg.org/badges/version/pdfium)](https://CRAN.R-project.org/package=pdfium) +[![Codecov test coverage](https://codecov.io/gh/humanpred/rpdfium/graph/badge.svg)](https://app.codecov.io/gh/humanpred/rpdfium) -`pdfium` provides idiomatic R bindings to [Google’s PDFium -engine](https://pdfium.googlesource.com/pdfium/) — the same library that -powers Chrome’s PDF viewer. 
It has two halves: +`pdfium` provides idiomatic R bindings to +[Google's PDFium engine](https://pdfium.googlesource.com/pdfium/) — the +same library that powers Chrome's PDF viewer. It has two halves: -- a **read surface** that exposes vector-path geometry — stroke / fill / - Bezier control points / transformation matrices — alongside text, - fonts, images, annotations, form fields, attachments, signatures, - structure tree, and rendering. The path geometry, in particular, no - other CRAN package surfaces today. -- a **mutation surface** (opt-in via `readwrite = TRUE`) that lets you - rotate / reorder / merge pages, draw fresh page objects, create and - edit annotations, fill form fields, and add file attachments — then - save the result. +* a **read surface** that exposes vector-path geometry — + stroke / fill / Bezier control points / transformation matrices — + alongside text, fonts, images, annotations, form fields, + attachments, signatures, structure tree, and rendering. The path + geometry, in particular, no other CRAN package surfaces today. +* a **mutation surface** (opt-in via `readwrite = TRUE`) that lets + you rotate / reorder / merge pages, draw fresh page objects, + create and edit annotations, fill form fields, and add file + attachments — then save the result. ## What it is for -- **Auditing** PDF figures (which lines, which colors, which fonts). -- **Extracting** curves from regulatory filings and scientific +* **Auditing** PDF figures (which lines, which colors, which fonts). +* **Extracting** curves from regulatory filings and scientific publications. -- **Building** PDF normalization pipelines that need geometry, not just - text. -- **Filling** AcroForm fields programmatically and flattening the result - for downstream tooling. 
-- **Authoring** programmatic PDFs from vector graphics, JPEG images, - text in the 14 standard fonts or any TrueType / Type1 typeface, and - annotations (think: figure callouts, table reports, annotated source - documents). `/Info`-dict writes and on-save encryption are the - remaining v0.1.0 gaps — both need upstream PDFium changes that we’ve - proposed but Google hasn’t shipped yet. -- Anything you’d otherwise drop into Python with `pypdfium2`. - -See -[`vignette("mutating-pdfs")`](https://humanpred.github.io/rpdfium/articles/mutating-pdfs.html) +* **Building** PDF normalization pipelines that need geometry, not + just text. +* **Filling** AcroForm fields programmatically and flattening the + result for downstream tooling. +* **Authoring** programmatic PDFs from vector graphics, JPEG + images, text in the 14 standard fonts or any TrueType / Type1 + typeface, and annotations (think: figure callouts, table + reports, annotated source documents). `/Info`-dict writes and + on-save encryption are the remaining v0.1.0 gaps — both need + upstream PDFium changes that we've proposed but Google hasn't + shipped yet. +* Anything you'd otherwise drop into Python with `pypdfium2`. + +See [`vignette("mutating-pdfs")`](https://humanpred.github.io/rpdfium/articles/mutating-pdfs.html) for a walkthrough of the writer surface, and [`vignette("comparison")`](https://humanpred.github.io/rpdfium/articles/comparison.html) for how `pdfium` lines up against `pdftools`, `qpdf`, `magick`, @@ -63,14 +63,7 @@ under `dev/decisions/`. ## Installation -`pdfium` downloads its `libpdfium` binary from -[bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries) -at install time. The pinned version lives in `tools/pdfium-version.txt`. -If your install runs without internet access, set `PDFIUM_OFFLINE=1` and -place the matching tarball under `inst/pdfium-binaries/` before -installing. 
- -``` r +```r # Release version (once on CRAN): install.packages("pdfium") @@ -78,8 +71,49 @@ install.packages("pdfium") remotes::install_github("humanpred/rpdfium") ``` +### Where the `libpdfium` binary comes from + +At install time, the `configure` script picks a `libpdfium` to +build against, in this order: + +1. **`PDFIUM_HOME`** — if this environment variable is set and + points at an existing install, that install is used. The + directory must contain headers and the shared library in the + conventional layout: + + | Platform | Required files under `$PDFIUM_HOME` | + |------------|----------------------------------------------------------------------| + | Linux | `include/fpdfview.h` and `lib/libpdfium.so` (or `lib64/`) | + | macOS | `include/fpdfview.h` and `lib/libpdfium.dylib` | + | Windows | `include/fpdfview.h`, `lib/libpdfium.dll.a`, `bin/libpdfium.dll` | + + Useful when you have a hand-built PDFium, a vendored copy, + or a CI artefact you want to pin against. + +2. **`pkg-config --exists libpdfium`** *(POSIX only)* — if a + `libpdfium.pc` is on the `pkg-config` search path, the + reported `includedir` / `libdir` are used. + +3. **Standard system prefixes** *(POSIX only)* — `/usr/local`, + `/usr`, `/opt/homebrew`, `/opt/local`. The first one + containing both `include/fpdfview.h` and a `libpdfium` + shared library wins. + +4. **Download from + [bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries)** + — the pinned release lives in `tools/pdfium-version.txt`. If + your install runs without internet access, set + `PDFIUM_OFFLINE=1` and place the matching tarball under + `inst/pdfium-binaries/` before installing. + +When a system install is found, no download happens and no +`libpdfium` is bundled into the installed package — your +existing copy resolves at load time via the platform's normal +shared-library search path. 
+ ## Example + ``` r library(pdfium) @@ -90,8 +124,8 @@ pdf_page_count(doc) pdf_doc_close(doc) ``` -More examples ship in the vignettes -(`vignette("getting-started", package = "pdfium")`, etc.) and on the +More examples ship in the vignettes (`vignette("getting-started", +package = "pdfium")`, etc.) and on the [pkgdown site](https://humanpred.github.io/rpdfium/). ## License diff --git a/man/pdfium-package.Rd b/man/pdfium-package.Rd index 4f137c8..a4be235 100644 --- a/man/pdfium-package.Rd +++ b/man/pdfium-package.Rd @@ -22,10 +22,23 @@ rendering) arrive in subsequent releases. \section{Binary distribution}{ -The underlying \code{libpdfium} shared library is downloaded from -\href{https://github.com/bblanchon/pdfium-binaries}{bblanchon/pdfium-binaries} -the first time the package is installed. The pinned version lives in -\code{tools/pdfium-version.txt}. +At install time, the \code{configure} script picks a \code{libpdfium} to +build against, in this order: +\enumerate{ +\item The \code{PDFIUM_HOME} environment variable, if it points at a +directory containing \code{include/fpdfview.h} and a +\code{libpdfium} shared library (\verb{lib/libpdfium.\{so,dylib\}} on +POSIX, or \code{lib/libpdfium.dll.a} + \code{bin/libpdfium.dll} on +Windows). +\item \verb{pkg-config --exists libpdfium} (POSIX only). +\item Standard system prefixes: \verb{/usr/local}, \verb{/usr}, +\verb{/opt/homebrew}, \verb{/opt/local} (POSIX only). +\item Download from +\href{https://github.com/bblanchon/pdfium-binaries}{bblanchon/pdfium-binaries}. +The pinned release lives in \code{tools/pdfium-version.txt}. +Set \code{PDFIUM_OFFLINE=1} and stage the tarball under +\verb{inst/pdfium-binaries/} for offline installs. +} } \seealso{