From 53be3ee025875f9a7615d835deb1e0cf4a3a1b69 Mon Sep 17 00:00:00 2001
From: nthmost
Date: Tue, 23 Jun 2026 19:00:08 -0700
Subject: [PATCH] wiki_dumps: strip the 86 page from public dumps

The public dump invites bots and AI scrapers to ingest the whole wiki (see
index.json's note_to_bots), but dumpBackup.php exports the 86 page along
with everything else. That undoes the robots.txt Disallow/Noindex on
/wiki/86 -- the dump becomes the larger exposure vector.

Add a post-export filter that drops excluded base titles plus their
subpages (Base/...) and Talk pages (Talk:Base, Talk:Base/...) before
publishing. Titles are configurable via wiki_dumps_exclude_titles (default:
86), kept in sync with roles/mediawiki/files/robots.txt, and may be written
in URL form (underscores are treated as spaces). The EXCLUDE_TITLES env var
is passed through the systemd service unit, quoted so titles containing
spaces survive.

The raw export is staged in a private scratch directory, never in the
web-served public directory, so the unfiltered file cannot be downloaded
while the filter runs or be left behind when it fails.

If the filter fails, or is given no usable title, the dump aborts (set -e)
rather than publishing an unfiltered file, so latest.xml.gz keeps pointing
at the last good dump.
---
Note: the blob ids on the "index" lines predate this revision of the patch;
regenerate with git format-patch before sending.

 roles/wiki_dumps/defaults/main.yml            |  6 ++
 roles/wiki_dumps/files/dump_filter.py         | 96 +++++++++++++++++++
 roles/wiki_dumps/files/wiki_dump.sh           | 22 ++++-
 roles/wiki_dumps/tasks/main.yml               |  8 ++
 .../wiki_dumps/templates/wiki_dump.service.j2 |  1 +
 5 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 roles/wiki_dumps/files/dump_filter.py

diff --git a/roles/wiki_dumps/defaults/main.yml b/roles/wiki_dumps/defaults/main.yml
index b7f8ed29..da4920f1 100644
--- a/roles/wiki_dumps/defaults/main.yml
+++ b/roles/wiki_dumps/defaults/main.yml
@@ -6,6 +6,12 @@ wiki_dumps_localsettings: "/srv/mediawiki/{{ mediawiki.domain }}/LocalSettings.p
 wiki_dumps_public_dir: /var/www/dumps.noisebridge.net
 wiki_dumps_public_keep_days: 7
 
+# Base page titles stripped from the public dump, along with their subpages
+# (Base/...) and Talk pages (Talk:Base, Talk:Base/...). Keep in sync with the
+# Disallow/Noindex entries in roles/mediawiki/files/robots.txt.
+wiki_dumps_exclude_titles:
+  - "86"
+
 # Timer schedule (default: 2AM daily)
 wiki_dumps_cron_hour: 2
 wiki_dumps_cron_minute: 0
diff --git a/roles/wiki_dumps/files/dump_filter.py b/roles/wiki_dumps/files/dump_filter.py
new file mode 100644
index 00000000..5308301c
--- /dev/null
+++ b/roles/wiki_dumps/files/dump_filter.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+"""Strip excluded pages from a MediaWiki XML dump before it is published.
+
+Reads a gzipped MediaWiki export, removes every page whose title matches an
+excluded base title -- the base itself, its subpages (``Base/...``), and the
+corresponding Talk pages (``Talk:Base`` and ``Talk:Base/...``) -- and writes a
+gzipped, filtered dump.
+
+Usage: dump_filter.py SRC.xml.gz DST.xml.gz [EXCLUDE_TITLES]
+    EXCLUDE_TITLES is a comma-separated list of base titles (default "86").
+    Titles may be written in URL form (``Some_Page``), as in robots.txt;
+    underscores are treated as spaces.
+
+Exits non-zero on any error so the caller can refuse to publish an unfiltered
+dump (better to publish nothing than to leak an excluded page).
+"""
+import gzip
+import sys
+import xml.etree.ElementTree as ET
+
+XSI = "http://www.w3.org/2001/XMLSchema-instance"
+
+
+def excluded_matcher(bases):
+    """Return a predicate that tells whether a page title must be stripped.
+
+    Each base title is normalised the way titles appear in a dump (underscores
+    become spaces, surrounding whitespace is dropped); blank entries are
+    ignored. Both the title as given and its first-letter-capitalised form are
+    matched, so the filter errs towards removing too much on wikis whose
+    titles are case-sensitive in the first letter.
+    """
+    variants = set()
+    for base in bases:
+        base = base.replace("_", " ").strip()
+        if not base:
+            continue
+        for name in (base, base[0].upper() + base[1:]):
+            variants.add(name)
+            variants.add(f"Talk:{name}")
+    # str.startswith accepts a tuple: one call covers every subpage prefix.
+    prefixes = tuple(f"{v}/" for v in variants)
+
+    def is_excluded(title):
+        if title is None:
+            return False
+        return title in variants or title.startswith(prefixes)
+
+    return is_excluded
+
+
+def main():
+    """Filter SRC into DST, exiting non-zero instead of leaking a page."""
+    if len(sys.argv) < 3:
+        sys.exit(f"usage: {sys.argv[0]} SRC.xml.gz DST.xml.gz [EXCLUDE_TITLES]")
+    src, dst = sys.argv[1], sys.argv[2]
+    bases = (sys.argv[3] if len(sys.argv) > 3 else "86").split(",")
+    if not any(base.replace("_", " ").strip() for base in bases):
+        # Fail closed: with nothing to exclude we would publish everything.
+        sys.exit("dump_filter: EXCLUDE_TITLES contains no usable title")
+    is_excluded = excluded_matcher(bases)
+
+    with gzip.open(src, "rb") as fh:
+        tree = ET.parse(fh)
+    root = tree.getroot()
+
+    # Preserve the export namespace (version varies: 0.10, 0.11, ...) as the
+    # default so the output stays byte-clean rather than gaining ns0: prefixes.
+    # A root without a namespace has no "{uri}" part, so test for it instead
+    # of slicing with the -1 that str.find() returns.
+    ns_uri = root.tag[1:].partition("}")[0] if root.tag.startswith("{") else ""
+    prefix = f"{{{ns_uri}}}" if ns_uri else ""
+    if ns_uri:
+        ET.register_namespace("", ns_uri)
+    ET.register_namespace("xsi", XSI)
+
+    removed = []
+    # findall() returns a list, so removing pages while looping is safe.
+    for page in root.findall(f"{prefix}page"):
+        title = page.findtext(f"{prefix}title")
+        if is_excluded(title):
+            root.remove(page)
+            removed.append(title)
+
+    with gzip.open(dst, "wb") as fh:
+        tree.write(fh, encoding="utf-8", xml_declaration=True)
+
+    if removed:
+        sys.stderr.write(f"dump_filter: removed {len(removed)} page(s): "
+                         f"{', '.join(removed)}\n")
+    else:
+        sys.stderr.write("dump_filter: no excluded pages found\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/roles/wiki_dumps/files/wiki_dump.sh b/roles/wiki_dumps/files/wiki_dump.sh
index 51bc1013..452c9f1a 100644
--- a/roles/wiki_dumps/files/wiki_dump.sh
+++ b/roles/wiki_dumps/files/wiki_dump.sh
@@ -13,6 +13,21 @@ if [[ ! -d "${PUBLIC_DIR}" ]]; then
 fi
 PUBLIC_KEEP_DAYS="${PUBLIC_KEEP_DAYS:-7}"
 
+# Comma-separated base titles to strip from the public dump (incl. their
+# subpages and Talk pages). Keep in sync with roles/mediawiki/files/robots.txt.
+# An empty value also falls back to the default (":-"), so filtering stays on.
+EXCLUDE_TITLES="${EXCLUDE_TITLES:-86}"
+FILTER="${FILTER:-/usr/local/sbin/wiki_dump_filter}"
+
+# The raw, unfiltered export must never be written under PUBLIC_DIR: that
+# directory is web-served, so the file would be downloadable while the filter
+# runs and would stay behind if the filter fails. Stage it in a private
+# scratch directory that is removed however the script exits.
+# NOTE(review): assumes no other EXIT trap is set in this script -- confirm.
+RAW_DIR="$(mktemp -d "${TMPDIR:-/var/tmp}/wiki_dump.XXXXXX")"
+trap 'rm -rf "${RAW_DIR}"' EXIT
+RAW="${RAW_DIR}/raw.xml.gz"
+
 PHP="${PHP:-php}"
 TS=$(date -u '+%Y%m%d')
 OUTFILE="${PUBLIC_DIR}/noisebridge-${TS}-public.xml.gz"
@@ -21,7 +36,12 @@ echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Starting public dump -> ${OUTFILE}"
 "${PHP}" "${MEDIAWIKI_PATH}/maintenance/dumpBackup.php" \
     --conf "${LOCALSETTINGS}" \
     --current \
-    --output "gzip:${OUTFILE}.tmp"
+    --output "gzip:${RAW}"
+
+# Strip excluded pages before publishing. If the filter fails we abort (set -e)
+# rather than publish an unfiltered dump that would leak those pages.
+echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Filtering excluded pages: ${EXCLUDE_TITLES}"
+python3 "${FILTER}" "${RAW}" "${OUTFILE}.tmp" "${EXCLUDE_TITLES}"
 mv "${OUTFILE}.tmp" "${OUTFILE}"
 ln -sf "${OUTFILE}" "${PUBLIC_DIR}/latest.xml.gz"
 echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Completed dump ($(du -sh "${OUTFILE}" | cut -f1))"
diff --git a/roles/wiki_dumps/tasks/main.yml b/roles/wiki_dumps/tasks/main.yml
index 260fe873..75d533c9 100644
--- a/roles/wiki_dumps/tasks/main.yml
+++ b/roles/wiki_dumps/tasks/main.yml
@@ -35,6 +35,14 @@
     owner: root
     group: root
 
+- name: Deploy dump filter
+  ansible.builtin.copy:
+    src: dump_filter.py
+    dest: /usr/local/sbin/wiki_dump_filter
+    mode: "0755"
+    owner: root
+    group: root
+
 - name: Deploy systemd service unit
   ansible.builtin.template:
     src: wiki_dump.service.j2
diff --git a/roles/wiki_dumps/templates/wiki_dump.service.j2 b/roles/wiki_dumps/templates/wiki_dump.service.j2
index 2071fce4..3ebef621 100644
--- a/roles/wiki_dumps/templates/wiki_dump.service.j2
+++ b/roles/wiki_dumps/templates/wiki_dump.service.j2
@@ -10,5 +10,6 @@ Environment=MEDIAWIKI_PATH={{ wiki_dumps_mediawiki_path }}
 Environment=LOCALSETTINGS={{ wiki_dumps_localsettings }}
 Environment=PUBLIC_DIR={{ wiki_dumps_public_dir }}
 Environment=PUBLIC_KEEP_DAYS={{ wiki_dumps_public_keep_days }}
+Environment="EXCLUDE_TITLES={{ wiki_dumps_exclude_titles | join(',') }}"
 Environment=PHP=/usr/bin/php8.2
 ExecStart=/usr/local/sbin/wiki_dump