Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions roles/wiki_dumps/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ wiki_dumps_localsettings: "/srv/mediawiki/{{ mediawiki.domain }}/LocalSettings.p
wiki_dumps_public_dir: /var/www/dumps.noisebridge.net
wiki_dumps_public_keep_days: 7

# Base page titles stripped from the public dump, along with their subpages
# (Base/...) and Talk pages (Talk:Base, Talk:Base/...). Keep in sync with the
# Disallow/Noindex entries in roles/mediawiki/files/robots.txt.
# Titles must not contain a comma: the list is handed to the filter as a
# single comma-separated string.
wiki_dumps_exclude_titles:
- "86"

# Timer schedule (default: 2AM daily)
wiki_dumps_cron_hour: 2
wiki_dumps_cron_minute: 0
81 changes: 81 additions & 0 deletions roles/wiki_dumps/files/dump_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python3
"""Strip excluded pages from a MediaWiki XML dump before it is published.

Reads a gzipped MediaWiki export, removes any <page> whose title matches an
excluded base title -- the base itself, its subpages (``Base/...``), and the
corresponding Talk pages (``Talk:Base`` and ``Talk:Base/...``) -- and writes a
gzipped, filtered dump.

Usage: dump_filter.py SRC.xml.gz DST.xml.gz [EXCLUDE_TITLES]
EXCLUDE_TITLES is a comma-separated list of base titles (default "86").

Exits non-zero on any error so the caller can refuse to publish an unfiltered
dump (better to publish nothing than to leak an excluded page).
"""
import gzip
import sys
import xml.etree.ElementTree as ET

XSI = "http://www.w3.org/2001/XMLSchema-instance"


def excluded_matcher(bases):
    """Build a predicate that says whether a page title must be stripped.

    Every non-blank entry of *bases* excludes the page itself, its subpages
    (``Base/...``) and the matching Talk pages (``Talk:Base`` and
    ``Talk:Base/...``). Surrounding whitespace is ignored and underscores are
    treated as spaces, so a base written in URL form (``Some_Page``) matches
    the ``Some Page`` title found in the dump.

    Args:
        bases: Iterable of base page titles; blank entries are skipped.

    Returns:
        A function ``is_excluded(title)`` returning ``True`` when *title* is
        one of the excluded pages and ``False`` otherwise. ``None`` (a page
        without a title) is never excluded.
    """
    def normalize(title):
        # MediaWiki treats "_" and " " in a title as the same character;
        # URLs use the underscore form, XML exports the space form.
        return title.replace("_", " ").strip()

    exact = set()
    for base in bases:
        base = normalize(base)
        if not base:
            continue
        exact.update((base, f"Talk:{base}"))

    # str.startswith accepts a tuple, so all subpage prefixes are tested in
    # one call. The trailing "/" keeps "86" from matching "860".
    prefixes = tuple(f"{variant}/" for variant in exact)

    def is_excluded(title):
        if title is None:
            return False
        title = normalize(title)
        return title in exact or title.startswith(prefixes)

    return is_excluded


def main():
    """Filter the dump named on the command line and write the result.

    Reads ``sys.argv``: SRC (gzipped export to read), DST (gzipped filtered
    dump to write) and an optional comma-separated EXCLUDE_TITLES list
    (default ``"86"``). A one-line summary of the removed pages is written to
    stderr.

    Exits with a usage message when fewer than two paths are given. Other
    failures (unreadable or malformed input, unwritable output) are not
    caught, so the resulting traceback ends the process with a non-zero
    status.
    """
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} SRC.xml.gz DST.xml.gz [EXCLUDE_TITLES]")
    src, dst = sys.argv[1], sys.argv[2]
    bases = (sys.argv[3] if len(sys.argv) > 3 else "86").split(",")
    is_excluded = excluded_matcher(bases)

    with gzip.open(src, "rb") as fh:
        tree = ET.parse(fh)
    root = tree.getroot()

    # ElementTree spells a namespaced tag as "{uri}local". Preserve the export
    # namespace (version varies: 0.10, 0.11, ...) as the default so the output
    # does not gain ns0: prefixes. A root without a namespace is handled
    # explicitly: slicing on a missing "{" / "}" would produce a bogus URI,
    # no <page> would match, and an unfiltered dump would be written.
    if root.tag.startswith("{"):
        ns_uri = root.tag[1:root.tag.index("}")]
        ET.register_namespace("", ns_uri)
        qualifier = f"{{{ns_uri}}}"
    else:
        qualifier = ""
    ET.register_namespace("xsi", XSI)

    removed = []
    # findall() returns a fresh list, so removing children while looping over
    # it is safe.
    for page in root.findall(f"{qualifier}page"):
        title = page.findtext(f"{qualifier}title")
        if is_excluded(title):
            root.remove(page)
            removed.append(title)

    with gzip.open(dst, "wb") as fh:
        tree.write(fh, encoding="utf-8", xml_declaration=True)

    if removed:
        sys.stderr.write(f"dump_filter: removed {len(removed)} page(s): "
                         f"{', '.join(removed)}\n")
    else:
        sys.stderr.write("dump_filter: no excluded pages found\n")

# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
17 changes: 16 additions & 1 deletion roles/wiki_dumps/files/wiki_dump.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ if [[ ! -d "${PUBLIC_DIR}" ]]; then
fi
PUBLIC_KEEP_DAYS="${PUBLIC_KEEP_DAYS:-7}"

# Comma-separated base titles to strip from the public dump (incl. their
# subpages and Talk pages). Keep in sync with roles/mediawiki/files/robots.txt.
EXCLUDE_TITLES="${EXCLUDE_TITLES:-86}"
FILTER="${FILTER:-/usr/local/sbin/wiki_dump_filter}"

PHP="${PHP:-php}"
TS=$(date -u '+%Y%m%d')
OUTFILE="${PUBLIC_DIR}/noisebridge-${TS}-public.xml.gz"
Expand All @@ -21,7 +26,17 @@ echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Starting public dump -> ${OUTFILE}"
"${PHP}" "${MEDIAWIKI_PATH}/maintenance/dumpBackup.php" \
--conf "${LOCALSETTINGS}" \
--current \
--output "gzip:${OUTFILE}.tmp"
--output "gzip:${OUTFILE}.raw.gz"

# Strip excluded pages before publishing. If the filter fails we abort rather
# than publish an unfiltered dump that would leak those pages. The raw dump and
# any partial output are deleted first, because both live in PUBLIC_DIR and
# would otherwise be left behind there by the abort.
# NOTE(review): the raw dump still sits in PUBLIC_DIR while it is being
# generated and filtered; consider writing it to a private directory instead.
if [[ -n "${EXCLUDE_TITLES}" ]]; then
  echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Filtering excluded pages: ${EXCLUDE_TITLES}"
  python3 "${FILTER}" "${OUTFILE}.raw.gz" "${OUTFILE}.tmp" "${EXCLUDE_TITLES}" || {
    status=$?
    rm -f -- "${OUTFILE}.raw.gz" "${OUTFILE}.tmp"
    echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Filter failed (exit ${status}); nothing published" >&2
    exit "${status}"
  }
  rm -f "${OUTFILE}.raw.gz"
else
  mv "${OUTFILE}.raw.gz" "${OUTFILE}.tmp"
fi
mv "${OUTFILE}.tmp" "${OUTFILE}"
ln -sf "${OUTFILE}" "${PUBLIC_DIR}/latest.xml.gz"
echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] Completed dump ($(du -sh "${OUTFILE}" | cut -f1))"
Expand Down
8 changes: 8 additions & 0 deletions roles/wiki_dumps/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@
owner: root
group: root

- name: Deploy dump filter
ansible.builtin.copy:
src: dump_filter.py
dest: /usr/local/sbin/wiki_dump_filter
mode: "0755"
owner: root
group: root

- name: Deploy systemd service unit
ansible.builtin.template:
src: wiki_dump.service.j2
Expand Down
1 change: 1 addition & 0 deletions roles/wiki_dumps/templates/wiki_dump.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ Environment=MEDIAWIKI_PATH={{ wiki_dumps_mediawiki_path }}
Environment=LOCALSETTINGS={{ wiki_dumps_localsettings }}
Environment=PUBLIC_DIR={{ wiki_dumps_public_dir }}
Environment=PUBLIC_KEEP_DAYS={{ wiki_dumps_public_keep_days }}
# Quoted as a whole: page titles may contain spaces, and systemd splits an
# unquoted Environment= value on whitespace into separate assignments.
Environment="EXCLUDE_TITLES={{ wiki_dumps_exclude_titles | join(',') }}"
Environment=PHP=/usr/bin/php8.2
ExecStart=/usr/local/sbin/wiki_dump