#!/usr/bin/env bash
#
# substack-filter.sh — filter a list of URLs to only those that redirect to / are hosted on Substack
#
# Author: Gwern Branwen
# Date: 2026-02-06
# License: CC-0
#
# Usage:
#   ./substack-filter.sh [INPUT_FILE] [OUTPUT_FILE]
#   cat urls.txt | ./substack-filter.sh
#   ./substack-filter.sh urls.txt substack-urls.txt
#
# This is useful for finding domains to mirror or treat specially.
# One could also use it to analyze your browsing or bookmark history
# to find Substack-using URLs (perhaps to 'like' them en masse),
# eg. `firefox $(cat ~/wiki/doc/meta/2026-02-06-gwern-gwernnet-allsubstackusingurlsgeneratedbysubstackprobesh.txt)`
#
# Approach:
#   1. *.substack.com domains are kept immediately.
#   2. Unknown domains are checked via parallel DNS CNAME checks
#      (Substack custom domains CNAME to target.substack-custom-domains.com).
#   3. Remaining unknowns get a parallel HTTP HEAD check for the
#      'x-served-by: Substack' header (catches apex domains and
#      Cloudflare-proxied setups where CNAME is flattened).
#
# Notes:
#   - Bash-only; GNU userland assumed.
#   - Fails fast; recoverable network errors are explicitly ignored.
#
#
# Example script to open the results systematically in Chromium:
#
#
## INPUT="/home/gwern/2026-02-06-gwern-gwernnet-allsubstackusingurlsgeneratedbysubstackchecksh.txt"
## BATCH="${2:-20}"
## START="${3:-1}"
##
## tail --lines="+${START}" -- "$INPUT" | while mapfile -t -n "$BATCH" URLS && (( ${#URLS[@]} )); do
##     chromium "${URLS[@]}" &
##     START=$(( START + ${#URLS[@]} ))
##     echo >&2 "Opened ${#URLS[@]} tabs. Next batch: START=${START}. Press Enter to continue, Ctrl-C to stop..."
##     read -r < /dev/tty
## done

set -e

########################################
# Configuration
########################################

# INPUT: file of URLs, one per line ('-' = stdin). OUTPUT: file to (over)write.
INPUT="${1:--}"
OUTPUT="${2:-substack-urls.txt}"
# Worker counts for the DNS and HTTP phases; overridable via the environment.
PARALLEL_DNS="${PARALLEL_DNS:-50}"
PARALLEL_HEAD="${PARALLEL_HEAD:-10}"

########################################
# Helpers
########################################

# Print a message to stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Abort with a message listing every command in "$@" that is not on PATH.
require_cmds() {
  local cmd
  local missing=()
  for cmd in "$@"; do
    command -v "$cmd" >/dev/null 2>&1 || missing+=("$cmd")
  done
  if (( ${#missing[@]} )); then
    die "Missing required commands: ${missing[*]}"
  fi
}

# Extract the host of an http(s) URL ($1) into the global REPLY.
# Userinfo ("user:pass@") and a trailing ":port" are removed and the host is
# lowercased, so the result can be used as a lookup key and handed to dig.
# Returns 1 (with REPLY empty) for blank lines and anything that is not an
# http(s) URL. The result goes through REPLY rather than stdout so that no
# subshell is forked per URL.
url_domain() {
  local url=$1 host
  local -r url_re='^[Hh][Tt][Tt][Pp][Ss]?://([^/?#]+)'
  local -r port_re='^(.*):[0-9]*$'
  REPLY=
  [[ "$url" =~ $url_re ]] || return 1
  host=${BASH_REMATCH[1]}
  host=${host##*@}
  if [[ "$host" =~ $port_re ]]; then
    host=${BASH_REMATCH[1]}
  fi
  [[ -n "$host" ]] || return 1
  REPLY=${host,,}
}

# Read domains (one per line) from stdin, flag each as Substack in
# DOMAIN_IS_SUBSTACK, and log it to stderr. $1 names the detection method
# shown in the log line.
mark_substack() {
  local how=$1 domain
  while IFS= read -r domain; do
    [[ -n "$domain" ]] || continue
    DOMAIN_IS_SUBSTACK["$domain"]=1
    echo >&2 "  $domain → Substack ($how)"
  done
}

########################################
# Pre-flight
########################################

require_cmds dig curl xargs wc grep

########################################
# Main
########################################

# Read all URLs into memory. Reading directly (instead of via `cat` in a
# process substitution) lets a missing/unreadable file abort the script.
if [[ "$INPUT" == "-" ]]; then
  mapfile -t URLS
else
  [[ -r "$INPUT" ]] || die "Cannot read input file: $INPUT"
  mapfile -t URLS < "$INPUT"
fi

# DOMAIN_IS_SUBSTACK: domain -> 1 (Substack) | unknown (not yet confirmed)
# DOMAIN_SAMPLE_URL:  domain -> first URL seen for it (used by the HEAD check)
declare -A DOMAIN_IS_SUBSTACK
declare -A DOMAIN_SAMPLE_URL
UNKNOWN_DOMAINS=()

## Phase 1: classify domains by name alone
echo >&2 "Phase 1: classifying domains..."
for url in "${URLS[@]}"; do
  # Blank and non-http(s) lines have no domain; they can never be emitted.
  url_domain "$url" || continue
  domain=$REPLY

  if [[ "$domain" == *.substack.com || "$domain" == substack.com ]]; then
    DOMAIN_IS_SUBSTACK["$domain"]=1
  elif [[ -z "${DOMAIN_IS_SUBSTACK[$domain]+x}" ]]; then
    DOMAIN_IS_SUBSTACK["$domain"]=unknown
    DOMAIN_SAMPLE_URL["$domain"]="$url"
    UNKNOWN_DOMAINS+=("$domain")
  fi
done

echo >&2 "${#DOMAIN_IS_SUBSTACK[@]} unique domains (${#UNKNOWN_DOMAINS[@]} to check)"

## Phase 2: parallel CNAME checks (fast, no HTTP)
echo >&2 "Phase 2: CNAME check (parallelism=${PARALLEL_DNS})..."
CNAME_RESULTS=""
# Guarded: printf on an empty array would still emit one empty line.
if (( ${#UNKNOWN_DOMAINS[@]} )); then
  CNAME_RESULTS=$(printf '%s\n' "${UNKNOWN_DOMAINS[@]}" \
    | xargs --delimiter='\n' --no-run-if-empty \
        --max-procs="$PARALLEL_DNS" --max-args=1 sh -c '
      domain="$1"
      base="${domain#www.}"
      # Check the domain as-is (catches blog.*, newsletter.*, etc.)
      if dig +short +time=2 +tries=1 CNAME "$domain" 2>/dev/null \
          | grep --quiet --ignore-case "substack-custom-domains"; then
        printf "%s\n" "$domain"
      # Fall back to the www. prefix (catches bare domains whose www. CNAMEs to Substack)
      elif dig +short +time=2 +tries=1 CNAME "www.${base}" 2>/dev/null \
          | grep --quiet --ignore-case "substack-custom-domains"; then
        printf "%s\n" "$domain"
      fi
    ' _)
fi

mark_substack CNAME <<< "$CNAME_RESULTS"

## Phase 3: parallel HEAD fallback for remaining unknowns
## (catches apex domains which can't CNAME, and Cloudflare-proxied domains
## where the CNAME is flattened into A records)
REMAINING=()
for domain in "${UNKNOWN_DOMAINS[@]}"; do
  [[ "${DOMAIN_IS_SUBSTACK[$domain]}" == "unknown" ]] || continue
  REMAINING+=("$domain")
done

echo >&2 "Phase 3: HEAD checks for ${#REMAINING[@]} remaining domains (parallelism=${PARALLEL_HEAD})..."
HEAD_RESULTS=""
if (( ${#REMAINING[@]} )); then
  # Domain and sample URL go on SEPARATE lines and are consumed two at a time:
  # with --delimiter='\n' xargs passes each whole line as a single argument,
  # so a "domain url" line would arrive as $1 with $2 left empty.
  HEAD_RESULTS=$(for domain in "${REMAINING[@]}"; do
      printf '%s\n%s\n' "$domain" "${DOMAIN_SAMPLE_URL[$domain]}"
    done | xargs --delimiter='\n' --no-run-if-empty \
        --max-procs="$PARALLEL_HEAD" --max-args=2 sh -c '
      domain="$1"; url="$2"
      # Network failures are deliberately ignored: a failed probe just means
      # the domain stays unconfirmed.
      if curl --silent --head --location \
              --max-time 10 \
              --max-redirs 5 \
              "$url" 2>/dev/null \
          | grep --quiet --ignore-case "^x-served-by: Substack"; then
        printf "%s\n" "$domain"
      fi
    ' _)
fi

mark_substack HEAD <<< "$HEAD_RESULTS"

## Phase 4: emit matching URLs
echo >&2 "Phase 4: writing output..."
# Single redirection: truncates OUTPUT once and keeps it open for the loop.
for url in "${URLS[@]}"; do
  url_domain "$url" || continue
  if [[ "${DOMAIN_IS_SUBSTACK[$REPLY]:-0}" == "1" ]]; then
    printf '%s\n' "$url"
  fi
done > "$OUTPUT"

COUNT=$(wc --lines < "$OUTPUT")
echo >&2 "Done. Wrote $COUNT Substack URLs to $OUTPUT"