From fea8b64aa52b63940551f349d6da6d6737d5b0db Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Mon, 25 May 2026 20:14:31 +0800
Subject: [PATCH] Use parallel curl downloads for website link checks

---
 .github/workflows/links.yml | 65 ++++++++++++++++++++++++-------------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml
index 8f4959aaac4..080d0c0cec9 100644
--- a/.github/workflows/links.yml
+++ b/.github/workflows/links.yml
@@ -67,7 +67,7 @@ jobs:
 
           # Download initial sitemap and process
           echo "Downloading sitemap..."
-          SITEMAP=$(wget --compression=auto -qO- "https://${{ matrix.website }}/sitemap.xml") || { echo "Failed to download sitemap"; exit 1; }
+          SITEMAP=$(curl --compressed -fsSL --retry 3 "https://${{ matrix.website }}/sitemap.xml") || { echo "Failed to download sitemap"; exit 1; }
           echo "$SITEMAP" | parse_sitemap > urls.txt
 
           # Process any subsitemaps if they exist
@@ -77,7 +77,7 @@ jobs:
             grep -v 'sitemap' urls.txt > urls.tmp || true
             while read -r submap; do
               echo "Processing submap: $submap"
-              SUBMAP_CONTENT=$(wget --compression=auto -qO- "$submap") || { echo "Failed to download submap: $submap"; continue; }
+              SUBMAP_CONTENT=$(curl --compressed -fsSL --retry 3 "$submap") || { echo "Failed to download submap: $submap"; continue; }
               echo "$SUBMAP_CONTENT" | parse_sitemap >> urls.tmp
             done < subsitemaps.txt
             mv urls.tmp urls.txt || true
@@ -90,26 +90,47 @@ jobs:
       - name: Download Website
         continue-on-error: true
         run: |
-          # Set higher wait seconds for discourse community to avoid 429 rate limit errors
-          if [ "${{ matrix.website }}" = "community.ultralytics.com" ]; then
-            WAIT=1
-          else
-            WAIT=0.001
-          fi
-
-          # Download all URLs
-          wget \
-          --compression=auto \
-          --adjust-extension \
-          --reject "*.jpg*,*.jpeg*,*.png*,*.gif*,*.webp*,*.svg*,*.txt" \
-          --input-file=urls.txt \
-          --no-clobber \
-          --no-parent \
-          --wait=$WAIT \
-          --random-wait \
-          --tries=3 \
-          --no-verbose \
-          --force-directories
+          # Download all URLs as decompressed local HTML while using Brotli/gzip over the wire: first write a curl config of url/output pairs.
+          python - <<'PY'
+          from pathlib import Path
+          from urllib.parse import urlsplit
+
+          reject_suffixes = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".txt")
+
+          def quote(value: str) -> str:  # escape backslashes and double quotes for a quoted curl config value
+              return value.replace("\\", "\\\\").replace('"', '\\"')
+
+          queued = set()  # output paths already written to the config, so parallel curl never writes one file twice
+          with open("urls.txt", encoding="utf-8") as urls, open("curl-downloads.txt", "w", encoding="utf-8") as config:
+              for url in (line.strip() for line in urls):
+                  if not url:
+                      continue
+                  parsed = urlsplit(url)
+                  path = f"{parsed.netloc}{parsed.path or '/'}"  # a bare-domain URL maps to <host>/index.html, not a file named <host>
+                  if path.endswith("/"):
+                      output = f"{path}index.html"
+                  elif "." in Path(path).name:
+                      output = path
+                  else:
+                      output = f"{path}.html"
+                  if output.lower().endswith(reject_suffixes) or output in queued or Path(output).exists():
+                      continue
+                  config.write(f'url = "{quote(url)}"\noutput = "{quote(output)}"\n')
+                  queued.add(output)
+          print(f"Prepared {len(queued)} page downloads")
+          PY
+          # Fetch the pages listed in curl-downloads.txt in parallel; `|| true` stops a failed transfer from failing this step.
+          curl --parallel \
+            --parallel-max 16 \
+            --create-dirs \
+            --compressed \
+            --location \
+            --retry 3 \
+            --retry-all-errors \
+            --fail \
+            --silent \
+            --show-error \
+            --config curl-downloads.txt || true
 
       - name: Check image sizes
         if: github.event_name != 'workflow_dispatch' || inputs.check_images