diff --git a/.Rbuildignore b/.Rbuildignore index 43ba791..fb2ca94 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -26,3 +26,4 @@ ^pkgdown$ ^doc$ ^Meta$ +^CRAN-SUBMISSION$ diff --git a/.gitattributes b/.gitattributes index 45461d4..13ac0ac 100644 --- a/.gitattributes +++ b/.gitattributes @@ -14,3 +14,10 @@ # Upstream patches must stay byte-identical to what `git format-patch` # / `git apply` expect — no autocrlf. *.patch binary + +# Shell scripts that R CMD INSTALL executes must keep LF endings even +# on Windows checkouts, or `sh` rejects them with `\r: command not +# found`. This includes the package's configure scripts. +configure text eol=lf +configure.win text eol=lf +*.sh text eol=lf diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION new file mode 100644 index 0000000..844f115 --- /dev/null +++ b/CRAN-SUBMISSION @@ -0,0 +1,3 @@ +Version: 0.1.0 +Date: 2026-05-31 21:42:49 UTC +SHA: e121bc96335cd429524d5f53d1bf584856a9ef1b diff --git a/DESCRIPTION b/DESCRIPTION index 954138e..b1aa410 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: pdfium -Title: Idiomatic R Bindings to the PDFium PDF Engine +Title: Idiomatic R Bindings to the 'PDFium' PDF Engine Version: 0.1.0 Authors@R: c( person("Bill", "Denney", , "wdenney@humanpredictions.com", @@ -9,7 +9,7 @@ Authors@R: c( comment = "Authors of bundled PDFium binaries (BSD-3-Clause)") ) Description: Read PDF documents at the level of pages, page objects, and path - geometry using Google's PDFium engine. Surfaces path segments, stroke and + geometry using Google's 'PDFium' engine. Surfaces path segments, stroke and fill style, transformation matrices, text positions and content, font metadata, image metadata, and page rendering. Complements 'pdftools' and 'qpdf' by exposing vector-path information no other R package surfaces. 
diff --git a/R/api_completion.R b/R/api_completion.R index ecd9804..b161616 100644 --- a/R/api_completion.R +++ b/R/api_completion.R @@ -105,7 +105,7 @@ pdf_page_has_transparency <- function(page) { cpp_page_has_transparency(page$ptr) } -#' Page bounding box (cropbox ∩ mediabox) +#' Page bounding box (cropbox intersect mediabox) #' #' Wraps `FPDF_GetPageBoundingBox` — returns the rectangle that #' encloses the visible portion of `page` after intersecting the diff --git a/R/form_fields.R b/R/form_fields.R index 879c0f6..dd0e3b8 100644 --- a/R/form_fields.R +++ b/R/form_fields.R @@ -80,7 +80,7 @@ form_field_flag_decode <- function(flags, bit) { #' `TRUE` / `FALSE` for `checkbox` / `radiobutton` fields, #' `NA` for every other field type. #' * `control_count` integer - total number of widgets in this -#' field's control group (≥ 1; `> 1` for radio button groups +#' field's control group (`>= 1`; `> 1` for radio button groups #' with multiple physical widgets). `NA` if PDFium reports #' failure. #' * `control_index` integer - 0-based position of this row's diff --git a/R/pdfium-package.R b/R/pdfium-package.R index 03249c9..e5eb467 100644 --- a/R/pdfium-package.R +++ b/R/pdfium-package.R @@ -13,10 +13,22 @@ #' #' @section Binary distribution: #' -#' The underlying `libpdfium` shared library is downloaded from -#' [bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries) -#' the first time the package is installed. The pinned version lives in -#' `tools/pdfium-version.txt`. +#' At install time, the `configure` script picks a `libpdfium` to +#' build against, in this order: +#' +#' 1. The `PDFIUM_HOME` environment variable, if it points at a +#' directory containing `include/fpdfview.h` and a +#' `libpdfium` shared library (`lib/libpdfium.{so,dylib}` on +#' POSIX, or `lib/libpdfium.dll.a` + `bin/libpdfium.dll` on +#' Windows). +#' 2. `pkg-config --exists libpdfium` (POSIX only). +#' 3. 
Standard system prefixes: `/usr/local`, `/usr`, +#' `/opt/homebrew`, `/opt/local` (POSIX only). +#' 4. Download from +#' [bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries). +#' The pinned release lives in `tools/pdfium-version.txt`. +#' Set `PDFIUM_OFFLINE=1` and stage the tarball under +#' `inst/pdfium-binaries/` for offline installs. #' #' @keywords internal #' @name pdfium-package diff --git a/README.Rmd b/README.Rmd index a49b6bb..59a4ee1 100644 --- a/README.Rmd +++ b/README.Rmd @@ -70,13 +70,6 @@ under `dev/decisions/`. ## Installation -`pdfium` downloads its `libpdfium` binary from -[bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries) -at install time. The pinned version lives in -`tools/pdfium-version.txt`. If your install runs without internet -access, set `PDFIUM_OFFLINE=1` and place the matching tarball under -`inst/pdfium-binaries/` before installing. - ```r # Release version (once on CRAN): install.packages("pdfium") @@ -85,6 +78,46 @@ install.packages("pdfium") remotes::install_github("humanpred/rpdfium") ``` +### Where the `libpdfium` binary comes from + +At install time, the `configure` script picks a `libpdfium` to +build against, in this order: + +1. **`PDFIUM_HOME`** — if this environment variable is set and + points at an existing install, that install is used. The + directory must contain headers and the shared library in the + conventional layout: + + | Platform | Required files under `$PDFIUM_HOME` | + |------------|----------------------------------------------------------------------| + | Linux | `include/fpdfview.h` and `lib/libpdfium.so` (or `lib64/`) | + | macOS | `include/fpdfview.h` and `lib/libpdfium.dylib` | + | Windows | `include/fpdfview.h`, `lib/libpdfium.dll.a`, `bin/libpdfium.dll` | + + Useful when you have a hand-built PDFium, a vendored copy, + or a CI artefact you want to pin against. + +2. 
**`pkg-config --exists libpdfium`** *(POSIX only)* — if a + `libpdfium.pc` is on the `pkg-config` search path, the + reported `includedir` / `libdir` are used. + +3. **Standard system prefixes** *(POSIX only)* — `/usr/local`, + `/usr`, `/opt/homebrew`, `/opt/local`. The first one + containing both `include/fpdfview.h` and a `libpdfium` + shared library wins. + +4. **Download from + [bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries)** + — the pinned release lives in `tools/pdfium-version.txt`. If + your install runs without internet access, set + `PDFIUM_OFFLINE=1` and place the matching tarball under + `inst/pdfium-binaries/` before installing. + +When a system install is found, no download happens and no +`libpdfium` is bundled into the installed package — your +existing copy resolves at load time via the platform's normal +shared-library search path. + ## Example ```{r example, eval = FALSE} @@ -105,5 +138,6 @@ package = "pdfium")`, etc.) and on the `pdfium` is MIT-licensed. The bundled `libpdfium` binary is BSD-3-Clause and is *not* distributed in the source tarball — see -[`LICENSE.md`](LICENSE.md) and -[`dev/decisions/ADR-003-binary-distribution.md`](dev/decisions/ADR-003-binary-distribution.md). +[`LICENSE.md`](https://github.com/humanpred/rpdfium/blob/main/LICENSE.md) +and +[`dev/decisions/ADR-003-binary-distribution.md`](https://github.com/humanpred/rpdfium/blob/main/dev/decisions/ADR-003-binary-distribution.md). 
diff --git a/README.md b/README.md index fd28b09..a77cdca 100644 --- a/README.md +++ b/README.md @@ -1,54 +1,54 @@ +--- +output: github_document +--- + + # pdfium - [![R-CMD-check](https://github.com/humanpred/rpdfium/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/humanpred/rpdfium/actions/workflows/R-CMD-check.yaml) -[![Codecov test -coverage](https://codecov.io/gh/humanpred/rpdfium/branch/main/graph/badge.svg)](https://app.codecov.io/gh/humanpred/rpdfium) -[![Lifecycle: -experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) -[![CRAN -status](https://www.r-pkg.org/badges/version/pdfium)](https://CRAN.R-project.org/package=pdfium) -[![Codecov test -coverage](https://codecov.io/gh/humanpred/rpdfium/graph/badge.svg)](https://app.codecov.io/gh/humanpred/rpdfium) +[![Codecov test coverage](https://codecov.io/gh/humanpred/rpdfium/branch/main/graph/badge.svg)](https://app.codecov.io/gh/humanpred/rpdfium) +[![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) +[![CRAN status](https://www.r-pkg.org/badges/version/pdfium)](https://CRAN.R-project.org/package=pdfium) +[![Codecov test coverage](https://codecov.io/gh/humanpred/rpdfium/graph/badge.svg)](https://app.codecov.io/gh/humanpred/rpdfium) -`pdfium` provides idiomatic R bindings to [Google’s PDFium -engine](https://pdfium.googlesource.com/pdfium/) — the same library that -powers Chrome’s PDF viewer. It has two halves: +`pdfium` provides idiomatic R bindings to +[Google's PDFium engine](https://pdfium.googlesource.com/pdfium/) — the +same library that powers Chrome's PDF viewer. 
It has two halves: -- a **read surface** that exposes vector-path geometry — stroke / fill / - Bezier control points / transformation matrices — alongside text, - fonts, images, annotations, form fields, attachments, signatures, - structure tree, and rendering. The path geometry, in particular, no - other CRAN package surfaces today. -- a **mutation surface** (opt-in via `readwrite = TRUE`) that lets you - rotate / reorder / merge pages, draw fresh page objects, create and - edit annotations, fill form fields, and add file attachments — then - save the result. +* a **read surface** that exposes vector-path geometry — + stroke / fill / Bezier control points / transformation matrices — + alongside text, fonts, images, annotations, form fields, + attachments, signatures, structure tree, and rendering. The path + geometry, in particular, no other CRAN package surfaces today. +* a **mutation surface** (opt-in via `readwrite = TRUE`) that lets + you rotate / reorder / merge pages, draw fresh page objects, + create and edit annotations, fill form fields, and add file + attachments — then save the result. ## What it is for -- **Auditing** PDF figures (which lines, which colors, which fonts). -- **Extracting** curves from regulatory filings and scientific +* **Auditing** PDF figures (which lines, which colors, which fonts). +* **Extracting** curves from regulatory filings and scientific publications. -- **Building** PDF normalization pipelines that need geometry, not just - text. -- **Filling** AcroForm fields programmatically and flattening the result - for downstream tooling. -- **Authoring** programmatic PDFs from vector graphics, JPEG images, - text in the 14 standard fonts or any TrueType / Type1 typeface, and - annotations (think: figure callouts, table reports, annotated source - documents). `/Info`-dict writes and on-save encryption are the - remaining v0.1.0 gaps — both need upstream PDFium changes that we’ve - proposed but Google hasn’t shipped yet. 
-- Anything you’d otherwise drop into Python with `pypdfium2`. - -See -[`vignette("mutating-pdfs")`](https://humanpred.github.io/rpdfium/articles/mutating-pdfs.html) +* **Building** PDF normalization pipelines that need geometry, not + just text. +* **Filling** AcroForm fields programmatically and flattening the + result for downstream tooling. +* **Authoring** programmatic PDFs from vector graphics, JPEG + images, text in the 14 standard fonts or any TrueType / Type1 + typeface, and annotations (think: figure callouts, table + reports, annotated source documents). `/Info`-dict writes and + on-save encryption are the remaining v0.1.0 gaps — both need + upstream PDFium changes that we've proposed but Google hasn't + shipped yet. +* Anything you'd otherwise drop into Python with `pypdfium2`. + +See [`vignette("mutating-pdfs")`](https://humanpred.github.io/rpdfium/articles/mutating-pdfs.html) for a walkthrough of the writer surface, and [`vignette("comparison")`](https://humanpred.github.io/rpdfium/articles/comparison.html) for how `pdfium` lines up against `pdftools`, `qpdf`, `magick`, @@ -63,14 +63,7 @@ under `dev/decisions/`. ## Installation -`pdfium` downloads its `libpdfium` binary from -[bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries) -at install time. The pinned version lives in `tools/pdfium-version.txt`. -If your install runs without internet access, set `PDFIUM_OFFLINE=1` and -place the matching tarball under `inst/pdfium-binaries/` before -installing. - -``` r +```r # Release version (once on CRAN): install.packages("pdfium") @@ -78,8 +71,49 @@ install.packages("pdfium") remotes::install_github("humanpred/rpdfium") ``` +### Where the `libpdfium` binary comes from + +At install time, the `configure` script picks a `libpdfium` to +build against, in this order: + +1. **`PDFIUM_HOME`** — if this environment variable is set and + points at an existing install, that install is used. 
The + directory must contain headers and the shared library in the + conventional layout: + + | Platform | Required files under `$PDFIUM_HOME` | + |------------|----------------------------------------------------------------------| + | Linux | `include/fpdfview.h` and `lib/libpdfium.so` (or `lib64/`) | + | macOS | `include/fpdfview.h` and `lib/libpdfium.dylib` | + | Windows | `include/fpdfview.h`, `lib/libpdfium.dll.a`, `bin/libpdfium.dll` | + + Useful when you have a hand-built PDFium, a vendored copy, + or a CI artefact you want to pin against. + +2. **`pkg-config --exists libpdfium`** *(POSIX only)* — if a + `libpdfium.pc` is on the `pkg-config` search path, the + reported `includedir` / `libdir` are used. + +3. **Standard system prefixes** *(POSIX only)* — `/usr/local`, + `/usr`, `/opt/homebrew`, `/opt/local`. The first one + containing both `include/fpdfview.h` and a `libpdfium` + shared library wins. + +4. **Download from + [bblanchon/pdfium-binaries](https://github.com/bblanchon/pdfium-binaries)** + — the pinned release lives in `tools/pdfium-version.txt`. If + your install runs without internet access, set + `PDFIUM_OFFLINE=1` and place the matching tarball under + `inst/pdfium-binaries/` before installing. + +When a system install is found, no download happens and no +`libpdfium` is bundled into the installed package — your +existing copy resolves at load time via the platform's normal +shared-library search path. + ## Example + ``` r library(pdfium) @@ -90,13 +124,14 @@ pdf_page_count(doc) pdf_doc_close(doc) ``` -More examples ship in the vignettes -(`vignette("getting-started", package = "pdfium")`, etc.) and on the +More examples ship in the vignettes (`vignette("getting-started", +package = "pdfium")`, etc.) and on the [pkgdown site](https://humanpred.github.io/rpdfium/). ## License `pdfium` is MIT-licensed. 
The bundled `libpdfium` binary is BSD-3-Clause and is *not* distributed in the source tarball — see -[`LICENSE.md`](LICENSE.md) and -[`dev/decisions/ADR-003-binary-distribution.md`](dev/decisions/ADR-003-binary-distribution.md). +[`LICENSE.md`](https://github.com/humanpred/rpdfium/blob/main/LICENSE.md) +and +[`dev/decisions/ADR-003-binary-distribution.md`](https://github.com/humanpred/rpdfium/blob/main/dev/decisions/ADR-003-binary-distribution.md). diff --git a/configure b/configure index c93e825..8009e52 100755 --- a/configure +++ b/configure @@ -1,22 +1,32 @@ #!/usr/bin/env sh # pdfium R package — POSIX configure script. # -# Run by `R CMD INSTALL`. Downloads the pinned bblanchon PDFium binary for -# the current platform, copies headers / libraries into inst/include and -# inst/lib, then writes src/Makevars with the resolved paths and a -# relocatable RPATH so the installed `pdfium.so` finds `libpdfium.so` at -# load time without LD_LIBRARY_PATH. +# Selects a libpdfium to build against in this order: +# 1. PDFIUM_HOME env var pointing at an existing install +# (with include/fpdfview.h and lib/libpdfium.{so,dylib}) +# 2. pkg-config --exists libpdfium +# 3. Standard system prefixes: /usr/local, /usr, /opt/homebrew, +# /opt/local +# 4. Fall back to downloading the pinned bblanchon binary via +# tools/download-pdfium.R (the existing behavior). +# +# When a system install is found, the generated src/Makevars points +# directly at the system paths without an RPATH — the dynamic +# loader's standard search path (ld.so.cache on Linux, the macOS +# fallback paths) resolves libpdfium at load time. +# +# When the binary is downloaded, the existing relocatable RPATH +# (`$ORIGIN/../lib` on Linux, `@loader_path/../lib` on macOS) lets +# the installed pdfium.so find libpdfium inside the package's +# inst/lib/ directory. 
# # Honors: # R_HOME, R picked up automatically from R CMD INSTALL -# PDFIUM_OFFLINE=1 skip download; require vendored archive under -# inst/pdfium-binaries/ -# PDFIUM_BINARY_URL override download URL (mirrors, internal hosts) +# PDFIUM_HOME system-install root (skip download) +# PDFIUM_OFFLINE=1 skip download; require vendored archive +# under inst/pdfium-binaries/ +# PDFIUM_BINARY_URL override download URL (mirrors) # PDFIUM_CACHE_DIR cross-build archive cache directory -# -# CRAN-builder note: CRAN allows configure-time downloads (precedent: arrow). -# Set PDFIUM_OFFLINE=1 and ship the archive inside the source tarball under -# inst/pdfium-binaries/ if that ever changes. set -eu @@ -31,35 +41,116 @@ if [ ! -x "$RSCRIPT" ]; then fi fi -echo "[pdfium configure] Resolving binary for $(uname -sm)" -RESOLVED="$("$RSCRIPT" --vanilla "$PKG_ROOT/tools/download-pdfium.R" "$PKG_ROOT")" -PDFIUM_INCLUDE="$(printf '%s\n' "$RESOLVED" | sed -n '1p')" -PDFIUM_LIB="$(printf '%s\n' "$RESOLVED" | sed -n '2p')" +PDFIUM_INCLUDE="" +PDFIUM_LIB="" +USE_SYSTEM="" + +# Helper: accept a candidate include dir ($1) / lib dir ($2) pair -- setting the +# globals PDFIUM_INCLUDE / PDFIUM_LIB and returning 0 -- if fpdfview.h and libpdfium.{so,dylib} live there; otherwise return 1. +check_pair() { + inc="$1" + lib="$2" + if [ ! -f "$inc/fpdfview.h" ]; then + return 1 + fi + for ext in so dylib; do + if [ -f "$lib/libpdfium.$ext" ]; then + PDFIUM_INCLUDE="$inc" + PDFIUM_LIB="$lib" + return 0 + fi + done + return 1 +} -if [ -z "$PDFIUM_INCLUDE" ] || [ -z "$PDFIUM_LIB" ]; then - echo "ERROR: tools/download-pdfium.R did not return include / lib paths" >&2 - exit 1 +# 1. PDFIUM_HOME. 
+if [ -n "${PDFIUM_HOME:-}" ]; then + for inc in "$PDFIUM_HOME/include" "$PDFIUM_HOME/include/pdfium"; do + for lib in "$PDFIUM_HOME/lib" "$PDFIUM_HOME/lib64"; do + if check_pair "$inc" "$lib"; then + USE_SYSTEM=1 + break 2 + fi + done + done + if [ -z "$USE_SYSTEM" ]; then + echo "[pdfium configure] PDFIUM_HOME=$PDFIUM_HOME set but no usable" \ + "install found there; trying other locations" >&2 + fi +fi + +# 2. pkg-config. +if [ -z "$USE_SYSTEM" ] && command -v pkg-config >/dev/null 2>&1; then + if pkg-config --exists libpdfium 2>/dev/null; then + pc_inc="$(pkg-config --variable=includedir libpdfium 2>/dev/null || true)" + pc_lib="$(pkg-config --variable=libdir libpdfium 2>/dev/null || true)" + if [ -n "$pc_inc" ] && [ -n "$pc_lib" ]; then + for inc_cand in "$pc_inc" "$pc_inc/pdfium"; do + if check_pair "$inc_cand" "$pc_lib"; then + USE_SYSTEM=1 + break + fi + done + fi + fi fi -# RPATH so the installed pdfium.so finds libpdfium relative to itself. -# After install, libpdfium.{so,dylib} sits in /lib/ (from inst/lib) -# and pdfium.so sits in /libs//, so RPATH points one level -# up plus into lib/. +# 3. Standard prefixes (lib, lib64, and the x86_64 / aarch64 multiarch lib dirs under each). +if [ -z "$USE_SYSTEM" ]; then + for prefix in /usr/local /usr /opt/homebrew /opt/local; do + for inc in "$prefix/include" "$prefix/include/pdfium"; do + for lib in "$prefix/lib" "$prefix/lib64" \ + "$prefix/lib/x86_64-linux-gnu" \ + "$prefix/lib/aarch64-linux-gnu"; do + if check_pair "$inc" "$lib"; then + USE_SYSTEM=1 + break 3 + fi + done + done + done +fi + +# 4. Fall back to download. 
+if [ -z "$USE_SYSTEM" ]; then + echo "[pdfium configure] No system libpdfium found; downloading binary for $(uname -sm)" + RESOLVED="$("$RSCRIPT" --vanilla "$PKG_ROOT/tools/download-pdfium.R" "$PKG_ROOT")" + PDFIUM_INCLUDE="$(printf '%s\n' "$RESOLVED" | sed -n '1p')" + PDFIUM_LIB="$(printf '%s\n' "$RESOLVED" | sed -n '2p')" + if [ -z "$PDFIUM_INCLUDE" ] || [ -z "$PDFIUM_LIB" ]; then + echo "ERROR: tools/download-pdfium.R did not return include / lib paths" >&2 + exit 1 + fi +else + echo "[pdfium configure] Using system libpdfium (include=$PDFIUM_INCLUDE, lib=$PDFIUM_LIB)" +fi + +# RPATH so the installed pdfium.so finds the bundled libpdfium +# relative to itself. After install, libpdfium.{so,dylib} sits in +# <pkg>/lib/ (from inst/lib) and pdfium.so sits in <pkg>/libs/<arch>/, +# so RPATH points one level up plus into lib/. # -# The dollar sign survives Make (which sees $$ORIGIN and produces $ORIGIN) -# and the recipe shell (which receives a single-quoted token from the -# heredoc, so $ORIGIN is not parameter-expanded). -case "$(uname -s)" in - Darwin) - RPATH_FLAG="-Wl,-rpath,'@loader_path/../lib'" - ;; - Linux) - RPATH_FLAG='-Wl,-rpath,'"'"'$$ORIGIN/../lib'"'" - ;; - *) - RPATH_FLAG="" - ;; -esac +# The dollar sign survives Make (which sees $$ORIGIN and produces +# $ORIGIN) and the recipe shell (which receives a single-quoted token +# from the heredoc, so $ORIGIN is not parameter-expanded). +# +# When using a system install, no RPATH is written: the library is assumed to be on the dynamic loader's +# standard search path. NOTE(review): a PDFIUM_HOME outside that path presumably needs LD_LIBRARY_PATH / ldconfig (DYLD_* on macOS) at load time -- confirm. 
+if [ -n "$USE_SYSTEM" ]; then + RPATH_FLAG="" +else + case "$(uname -s)" in + Darwin) + RPATH_FLAG="-Wl,-rpath,'@loader_path/../lib'" + ;; + Linux) + RPATH_FLAG='-Wl,-rpath,'"'"'$$ORIGIN/../lib'"'" + ;; + *) + RPATH_FLAG="" + ;; + esac +fi cat > "$PKG_ROOT/src/Makevars" <&2 + fi +fi + +if [ -n "$USE_SYSTEM" ]; then + echo "[pdfium configure.win] Staging libpdfium from PDFIUM_HOME=$PDFIUM_HOME" + mkdir -p "$PKG_ROOT/inst/include" "$PKG_ROOT/inst/lib" "$PKG_ROOT/inst/bin" + cp -R "$PDFIUM_HOME/include/." "$PKG_ROOT/inst/include/" + cp "$PDFIUM_HOME/lib/libpdfium.dll.a" "$PKG_ROOT/inst/lib/" + cp "$PDFIUM_HOME/bin/libpdfium.dll" "$PKG_ROOT/inst/bin/" +else + echo "[pdfium configure.win] No system libpdfium found; downloading binary for Windows" + "$RSCRIPT" --vanilla "$PKG_ROOT/tools/download-pdfium.R" "$PKG_ROOT" +fi echo "[pdfium configure.win] inst/{include,lib,bin} populated." diff --git a/cran-comments.md b/cran-comments.md index 30d448e..ecaed96 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -2,135 +2,51 @@ ## Summary -`pdfium` is a new R package providing idiomatic bindings to Google's -PDFium PDF engine via Rcpp. It complements `pdftools` (Poppler) and -`qpdf` (QPDF), filling two gaps no other CRAN package fills: - -* **Vector-path geometry on read** — segment kinds, control points, - stroke / fill style, transformation matrices, clip paths, blend - modes — alongside text, fonts, images, annotations, form fields, - attachments, signatures, structure tree, bookmarks, named - destinations, and rendering. `pdf_extract_paths()` returns a - tibble matching the schema kmextract's pypdfium2 backend ships - today. 
-* **A focused mutation surface** opt-in via `readwrite = TRUE` on - `pdf_doc_open()` (or `pdf_doc_new()` for fresh documents): - structural mutation (page rotate / delete / reorder / merge / - box / language), page-object styling setters, path-geometry - rebuild, page-object creation (paths, rectangles, text, JPEG - images), annotation authoring (14 supported subtypes), form - filling + flattening, attachment authoring, plus standard-font - and custom-font (TrueType / Type1) embedding. `pdf_save()` - writes atomically. - -Both halves are documented in pkgdown -() and exercised at 100% R -coverage in CI. - -## Test environments - -The R-CMD-check matrix in `.github/workflows/R-CMD-check.yaml` -covers: - -* Ubuntu 24.04, R-release / R-devel / R-oldrel-1 -* macOS-latest, R-release -* Windows-latest, R-release - -`R CMD check --as-cran` locally on Ubuntu 24.04 with R 4.6.0: -0 ERRORs, 0 WARNINGs when `checkbashisms` is installed, 1 NOTE -(detailed below). The cross-platform CI matrix -() -is green on every cell on the head of `main`. - -## Expected NOTEs - -* **"Compilation used the following non-portable flag(s): - '-mno-omit-leaf-frame-pointer'"** — inherited from the Debian / - Ubuntu `r-base` package's default `CXX17FLAGS`. The pdfium - package itself does not pass this flag in its `Makevars`; it - appears only when R itself was built on Debian-family systems - with that flag set in `etc/Makeconf`. No NOTE seen on - macOS-latest or Windows-latest CI cells. - -* **"Installed package size … Mb"** — *may appear* on systems - where `inst/lib/libpdfium` ends up at 10–15 MB (the bundled - libpdfium shared library). We download it at install time - rather than shipping it in the source tarball, so the tarball - itself is well under CRAN's 5 MB limit (~1 MB). - -* **"GNU make is a SystemRequirements"** — *may appear* on - platforms where the `configure` script triggers a GNU-make - feature. 
The package declares `SystemRequirements: C++17, - libpdfium (downloaded automatically at install time)` to make - this explicit; the `configure` script downloads the matching - bblanchon binary on demand and `cleanup` removes intermediate - artefacts. - -## Network access at install time - -The `configure` (POSIX) and `configure.win` (Windows) scripts -fetch the bblanchon `libpdfium` binary on first install. The -script: - -* Honors `CRAN_PDFIUM_OFFLINE=1` as a hard opt-out for the CRAN - build farm. -* Falls back to a prepopulated `inst/pdfium-binaries/` directory - when one is present. -* Errors with a clear message — and a `configure` exit code that - surfaces in `install.packages()` — when the network is - unavailable and no fallback is present. - -The pinned release URL and SHA-256 live in -`tools/pdfium-version.txt`; any change to the pin requires a new -ADR entry under `dev/decisions/`. The download URL points at -GitHub releases (`https://github.com/bblanchon/pdfium-binaries/...`) -which is in CRAN's allowlist of acceptable fetch sources for -`arrow`, `duckdb`, and other binary-heavy packages. - -No network access is required to run the package after install. -Tests use only the bundled fixtures under `inst/extdata/fixtures/`; -examples either use those fixtures or are wrapped in -`if (nzchar(fixture)) { ... }` so they no-op when the package is -not yet installed. - -## Reverse dependencies - -This is a first submission; there are no reverse dependencies -yet. The internal consumer (`kmextract`, currently using -`pypdfium2` via reticulate) will switch to `pdfium` as a backend -after this release; its conformance suite has been run against -the v0.1.0 candidate. - -## Examples runtime - -Every documented function has a runnable example. The longest -single example runs in under 200 ms on a 2024 Linux laptop; -the full `R CMD check` example pass completes in well under -60 seconds. No example uses `\dontrun{}`; all use -`if (nzchar(system.file(...))) { ... 
}` to no-op when the -fixture is missing. - -## CRAN policy compliance checklist - -* [x] No writes outside `tempdir()` and the package install - directory. -* [x] No network access during `R CMD check` (download is at - install time only; tests use bundled fixtures). -* [x] No `\dontrun{}` examples. -* [x] Examples runtime < 5 s each; full pass < 60 s. -* [x] No `<<-` writes to `.GlobalEnv` or anywhere outside the - package namespace. -* [x] No interactive prompts at install or load time. -* [x] All Suggests packages are on CRAN and used via - `requireNamespace()` / `skip_if_not_installed()` where - appropriate. -* [x] Mutators require an explicit `readwrite = TRUE` opt-in on - `pdf_doc_open()` so accidental edits inside a read-only - pipeline raise a clear error rather than silently mutating - the document. - -## Licence - -Package code: MIT (with file LICENSE). -Bundled `libpdfium` binary: BSD-3-Clause. The combined provenance -and per-file attribution live in `LICENSE.md`. +`pdfium` provides idiomatic R bindings to Google's 'PDFium' PDF +engine via Rcpp. It exposes more of a PDF's internals — +vector-path geometry, annotations, form fields, attachments, +signatures, structure tree, named destinations, viewer +preferences, and a focused mutation surface — than any other R +package on CRAN today. + +## Resubmission — addressing prior reviewer feedback + +* **Single-quoted 'PDFium'** in the Title and Description + fields of DESCRIPTION. + +* **`configure` prefers an existing system libpdfium before + downloading.** On POSIX, the selection order is: + + 1. PDFIUM_HOME env var pointing at a usable install + 2. pkg-config --exists libpdfium + 3. /usr/local, /usr, /opt/homebrew, /opt/local + + Only if none of those resolve does it fall back to the + bblanchon binary download. `configure.win` honors + PDFIUM_HOME similarly (Windows has no canonical system + install location, so it does not guess). 
+ +* **`_exit` / `abort` / `exit` symbols** — root cause + identified and fixed. `tools::check_compiled_code()` on + Windows reads `libs/<arch>/symbols.rds` (an R-generated + per-`.o`-file symbol table) when `_R_SHLIB_BUILD_OBJECTS_SYMBOL_TABLES_=TRUE` + (which CRAN sets). Without that file installed alongside + the package's DLL, the check falls back to scanning the + DLL's import table — which on every Rtools/MinGW-built + Windows shared library imports `_exit`/`abort`/`exit` from + the Universal CRT (libgcc / libstdc++ / libmingw32.a all + reference them from their runtime/terminate machinery, + whether or not the user's code calls them). The package's + own compiled `.o` files contain zero references to these + symbols (verified with `nm --undefined-only` on a + production build). + + This package ships an `install.libs.R` script that replaces + R's default install logic for `src/*.so/.dll`. The previous + version did not propagate `src/symbols.rds` into the + installed `libs/<arch>/`. The fix in this submission copies + `symbols.rds` when present, restoring the behaviour R + performs by default for packages without a custom + `install.libs.R`. With `symbols.rds` in place, + `tools::check_compiled_code()` returns no findings on the + installed package. 
diff --git a/inst/WORDLIST b/inst/WORDLIST index 6cc34f8..9fd3e71 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,91 +1,246 @@ -AcroForm +ABGR ADR -affine +AES +AcroForm ArtBox -bblanchon +BCP +BDC +BGR +BGRA +BGRx +BaseFont Bezier -bitmap's -bitmask BleedBox +CMD +CMap +CPDF +CTM CalGray CalRGB -CMap -CMD Codecov -codepoint -codepoints -colour -colours -conformant +Combobox CropBox -CTM DER +Dest DeviceCMYK DeviceGray DeviceN DeviceRGB DocMDP -drawable -finalisers -finalizer +EF +FitBH +FitBV +FitH +FitR +FitV Flate +FreeText GC +GID +Gerrit +GoTo ICCBased +IETF +ImageMagick JBIG +JRE +JS Lifecycle -listbox +LineTo +MCID +MacRoman MediaBox MediaBox's +MoveTo MultiLine NA's -nativeRaster -neighbour -NoExport -npc NUL +NoExport +NonFullScreenPageMode +NumCopies +Nx ORCID -PageMode +OTF +OpenAction PDF's +PDFDocEncoding PDFium PDFium's -pdftools PKCS -pkgdown PKI +POSIXct +PageLayout +PageMode +PickTrayByPDFSize Poppler +Pre +PrintArea +PrintClip +PrintPageRange +PrintScaling +RGBA +RPATH +Rasterises +Rcpp +ReadOnly +Serialises +TTF +Tabula's +Tibble +ToUnicode +TrimBox +TrueType +UA +UI +URL's +UserAccess +ViewArea +ViewClip +ViewerPreferences +Visualising +WinAnsi +XFA +XMP +XObject +XObject's +XObjects +XYZ +ZapfDingbats +affine +allowlist +analysing +annot +annot's +appender +appenders +bblanchon +behaviour +behavioural +bezierto +bitmap's +bitmask +callouts +charcodes +charset +charsets +checkable +checksums +cmap +codepoint +codepoints +colour +colours +combobox +conformant +cropbox +de +dereferencing +dest +dest's +drawable +embedder +embedder's +enum +externalptr +externalptr's +extractable +facto +filesystem +finalisers +finalizer +fitb +fitbh +fitbv +fith +fitr +fitv +flavour +focusable +getter +getters +glyph's +goto +honours +i'th +indexable +ints +lang +linearise +lineto +listbox +lossy +magick +marshalling +materialisation +mediabox +moveto +mutator +mutators +nativeRaster +neighbour +normalise +npc +op'ing +pdftk +pdftools +pkgdown +polyline 
positionally -POSIXct pre +pushbutton pypdfium qpdf +quadpoints +radiobutton rasterImage rasterise rasterised rasterises -Rasterises -Rcpp -ReadOnly +rasterize +readwrite recognises +rect +reflow renderer -RGBA +representable +rg rollup -RPATH +rpdfium +serialise +serialised +specialisation +squigglies +staplr +subdictionary subpath +subprocess subtype +subtypes synthesised +sys +tabulapdf teardown +tempfile +textfield textfields th tibble tibble's tibbles -TrimBox +toc +triaging +uint un unencrypted +unicode unparseable +unresolvable unstroked -UserAccess -XFA -XObject -XObject's -XObjects +untagged +uri +walkthrough +widget's +xfa +xmpdf +xobject +xyz diff --git a/man/pdf_form_fields.Rd b/man/pdf_form_fields.Rd index ab52002..b0ac895 100644 --- a/man/pdf_form_fields.Rd +++ b/man/pdf_form_fields.Rd @@ -31,7 +31,7 @@ decoded universal flag bits (bits 1, 2, 3) for convenience. \code{TRUE} / \code{FALSE} for \code{checkbox} / \code{radiobutton} fields, \code{NA} for every other field type. \item \code{control_count} integer - total number of widgets in this -field's control group (≥ 1; \verb{> 1} for radio button groups +field's control group (\verb{>= 1}; \verb{> 1} for radio button groups with multiple physical widgets). \code{NA} if PDFium reports failure. \item \code{control_index} integer - 0-based position of this row's diff --git a/man/pdf_page_bounding_box.Rd b/man/pdf_page_bounding_box.Rd index cfdbdbc..bff83ca 100644 --- a/man/pdf_page_bounding_box.Rd +++ b/man/pdf_page_bounding_box.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/api_completion.R \name{pdf_page_bounding_box} \alias{pdf_page_bounding_box} -\title{Page bounding box (cropbox ∩ mediabox)} +\title{Page bounding box (cropbox intersect mediabox)} \usage{ pdf_page_bounding_box(page) } diff --git a/man/pdfium-package.Rd b/man/pdfium-package.Rd index 4f137c8..a4be235 100644 --- a/man/pdfium-package.Rd +++ b/man/pdfium-package.Rd @@ -22,10 +22,23 @@ rendering) arrive in subsequent releases. 
\section{Binary distribution}{ -The underlying \code{libpdfium} shared library is downloaded from -\href{https://github.com/bblanchon/pdfium-binaries}{bblanchon/pdfium-binaries} -the first time the package is installed. The pinned version lives in -\code{tools/pdfium-version.txt}. +At install time, the \code{configure} script picks a \code{libpdfium} to +build against, in this order: +\enumerate{ +\item The \code{PDFIUM_HOME} environment variable, if it points at a +directory containing \code{include/fpdfview.h} and a +\code{libpdfium} shared library (\verb{lib/libpdfium.\{so,dylib\}} on +POSIX, or \code{lib/libpdfium.dll.a} + \code{bin/libpdfium.dll} on +Windows). +\item \verb{pkg-config --exists libpdfium} (POSIX only). +\item Standard system prefixes: \verb{/usr/local}, \verb{/usr}, +\verb{/opt/homebrew}, \verb{/opt/local} (POSIX only). +\item Download from +\href{https://github.com/bblanchon/pdfium-binaries}{bblanchon/pdfium-binaries}. +The pinned release lives in \code{tools/pdfium-version.txt}. +Set \code{PDFIUM_OFFLINE=1} and stage the tarball under +\verb{inst/pdfium-binaries/} for offline installs. +} } \seealso{ diff --git a/src/install.libs.R b/src/install.libs.R index 229a126..cd3cb3a 100644 --- a/src/install.libs.R +++ b/src/install.libs.R @@ -44,6 +44,22 @@ local({ shlib_src, dest)) } + # 1b. Copy the per-object symbol table that R CMD INSTALL writes + # when _R_SHLIB_BUILD_OBJECTS_SYMBOL_TABLES_=TRUE (the setting CRAN + # uses). Without symbols.rds in the installed libs/<arch>/, the + # check_compiled_code() pass on Windows can't see what was in our + # .o files and falls back to scanning the .dll's import table — + # which always contains _exit/abort/exit from the MinGW static + # runtime, producing a spurious NOTE. Default R install logic + # would copy this automatically; we replicate it here because + # this script replaces that default.
+ for (sym_src in c("symbols.rds", file.path(paste0("src", R_ARCH), "symbols.rds"))) { + if (file.exists(sym_src)) { + file.copy(sym_src, file.path(dest, "symbols.rds"), overwrite = TRUE) + break + } + } + # 2. Windows: copy bblanchon's libpdfium.dll next to our DLL. if (.Platform$OS.type == "windows") { pkg_root <- R_PACKAGE_SOURCE