From fea8b64aa52b63940551f349d6da6d6737d5b0db Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Mon, 25 May 2026 20:14:31 +0800
Subject: [PATCH] Use parallel curl downloads for website link checks

---
Review notes (ignored by git am):
- Bare "https://host" sitemap entries are now saved as host/index.html.
- Output paths are de-duplicated so parallel transfers never share a file.
- community.ultralytics.com is fetched serially again to avoid 429 errors.
- The "index" blob hash below predates these review edits.

 .github/workflows/links.yml | 77 ++++++++++++++++++++++++++-----------
 1 file changed, 55 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml
index 8f4959aaac4..080d0c0cec9 100644
--- a/.github/workflows/links.yml
+++ b/.github/workflows/links.yml
@@ -67,7 +67,7 @@ jobs:
 
           # Download initial sitemap and process
           echo "Downloading sitemap..."
-          SITEMAP=$(wget --compression=auto -qO- "https://${{ matrix.website }}/sitemap.xml") || { echo "Failed to download sitemap"; exit 1; }
+          SITEMAP=$(curl --compressed -fsSL "https://${{ matrix.website }}/sitemap.xml") || { echo "Failed to download sitemap"; exit 1; }
           echo "$SITEMAP" | parse_sitemap > urls.txt
 
           # Process any subsitemaps if they exist
@@ -77,7 +77,7 @@ jobs:
             grep -v 'sitemap' urls.txt > urls.tmp || true
             while read -r submap; do
               echo "Processing submap: $submap"
-              SUBMAP_CONTENT=$(wget --compression=auto -qO- "$submap") || { echo "Failed to download submap: $submap"; continue; }
+              SUBMAP_CONTENT=$(curl --compressed -fsSL "$submap") || { echo "Failed to download submap: $submap"; continue; }
               echo "$SUBMAP_CONTENT" | parse_sitemap >> urls.tmp
             done < subsitemaps.txt
             mv urls.tmp urls.txt || true
@@ -90,26 +90,59 @@ jobs:
       - name: Download Website
         continue-on-error: true
         run: |
-          # Set higher wait seconds for discourse community to avoid 429 rate limit errors
-          if [ "${{ matrix.website }}" = "community.ultralytics.com" ]; then
-            WAIT=1
-          else
-            WAIT=0.001
-          fi
-
-          # Download all URLs
-          wget \
-            --compression=auto \
-            --adjust-extension \
-            --reject "*.jpg*,*.jpeg*,*.png*,*.gif*,*.webp*,*.svg*,*.txt" \
-            --input-file=urls.txt \
-            --no-clobber \
-            --no-parent \
-            --wait=$WAIT \
-            --random-wait \
-            --tries=3 \
-            --no-verbose \
-            --force-directories
+          # Fetch the discourse community serially (no parallel fan-out) to avoid 429 rate limit errors
+          if [ "${{ matrix.website }}" = "community.ultralytics.com" ]; then
+            PARALLEL_MAX=1
+          else
+            PARALLEL_MAX=16
+          fi
+
+          # Download all URLs as decompressed local HTML while using Brotli/gzip over the wire.
+          python - <<'PY'
+          from pathlib import Path
+          from urllib.parse import urlsplit
+
+          reject_suffixes = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".txt")
+
+          def quote(value: str) -> str:
+              """Escape backslashes and double quotes for a double-quoted curl config value."""
+              return value.replace("\\", "\\\\").replace('"', '\\"')
+
+          count = 0
+          seen = set()  # output paths already queued; parallel transfers must never share a target file
+          with open("urls.txt", encoding="utf-8") as urls, open("curl-downloads.txt", "w", encoding="utf-8") as config:
+              for url in (line.strip() for line in urls):
+                  if not url:
+                      continue
+                  parsed = urlsplit(url)
+                  # A bare "https://host" URL has an empty path; map it to host/index.html rather than a
+                  # file named after the host, which would block --create-dirs for every other page.
+                  path = f"{parsed.netloc}{parsed.path or '/'}"
+                  if path.endswith("/"):
+                      output = f"{path}index.html"
+                  elif "." in Path(path).name:
+                      output = path
+                  else:
+                      output = f"{path}.html"
+                  if output.lower().endswith(reject_suffixes) or output in seen or Path(output).exists():
+                      continue
+                  seen.add(output)
+                  config.write(f'url = "{quote(url)}"\noutput = "{quote(output)}"\n')
+                  count += 1
+          print(f"Prepared {count} page downloads")
+          PY
+
+          curl --compressed \
+            --fail \
+            --silent \
+            --show-error \
+            --location \
+            --retry 3 \
+            --retry-all-errors \
+            --create-dirs \
+            --parallel \
+            --parallel-max "$PARALLEL_MAX" \
+            --config curl-downloads.txt || true
 
       - name: Check image sizes
         if: github.event_name != 'workflow_dispatch' || inputs.check_images