#!/usr/bin/env bash
# Author: Gwern Branwen
# Date: 2016-10-01
# When: Time-stamp: "2026-04-10 16:59:35 gwern"
# License: CC-0
#
# sync-gwern.net.sh: shell script which automates a full build and sync of Gwern.net. A full build is intricate, and requires several passes like generating link-bibliographies/tag-directories, running two kinds of syntax-highlighting, stripping cruft etc.
#
# This script automates all of that: it cleans up, compiles a hakyll binary for faster compilation,
# generates a sitemap XML file, optimizes the MathJax use, checks for many kinds of errors, uploads,
# and cleans up.
# Key dependencies: GHC, Hakyll, emacs, curl, tidy (HTML5 version), git, regex-compat-tdfa (Unicode Haskell regexps), urlencode
# ('gridsite-clients' package), linkchecker, fdupes, ImageMagick, exiftool, mathjax-node-page (eg.
# `npm i -g mathjax-node-page`), parallel, xargs, php-cli, php-xml, php-masterminds-html5, libreoffice, gifsicle, libxml2-utils…
# Every relative path used below (including the `rm --recursive --force ./_cache/ ./_site/` cleanup)
# assumes the wiki checkout is the working directory, so refuse to continue if we cannot enter it.
cd ~/wiki/ || { echo "Error: could not change directory to ~/wiki/; aborting." >&2; exit 1; }
# shellcheck source=~/wiki/static/build/bash.sh
. ./static/build/bash.sh # import a bunch of Bash utilities for output formatting, checks, file IO etc: red/bold, wrap, gf/gfc/gfv/ge/gec/gev, png2JPGQualityCheck, gwmv...
# Names which must all resolve via `command -v` before the build is attempted:
DEPENDENCIES=(
    bc curl shuf dos2unix du elinks emacs exiftool fdupes feh ffmpeg file find firefox
    ghc ghci runghc hlint gifsicle git identify inotifywait jpegtran jq libreoffice
    linkchecker locate mogrify ocrmypdf pandoc parallel pdftk pdftotext img2pdf php ping
    optipng rm rsync sed tidy urlencode x-www-browser xargs xmllint xprintidle
    anchor-checker.php generateBacklinks.hs generateDirectory.hs
    generateLinkBibliography.hs generateSimilarLinks.hs link-extractor.hs
    compressJPG openai chromium inkscape node pngnq advpng docker
    should_image_have_outline.php mp3val
) # ~/src/node_modules/mathjax-node-page/bin/mjpage, beautifulsoup-4
declare -A ERROR_OUTPUTS # dependency name → text printed when we tried to run it anyway
DEPENDENCIES_MISSING=()
for DEP in "${DEPENDENCIES[@]}"; do
    # Resolvable: nothing to record for this one.
    command -v "$DEP" &> /dev/null && continue
    DEPENDENCIES_MISSING+=("$DEP")
    # Invoke the unresolved name regardless, keeping whatever it prints
    # (stdout & stderr merged) so it can be shown in the report below.
    error_msg=$({ "$DEP" 2>&1; } 2>/dev/null)
    ERROR_OUTPUTS["$DEP"]="$error_msg"
done
# Report every missing dependency at once, then abort with status 1.
if (( ${#DEPENDENCIES_MISSING[@]} > 0 )); then
    red "Error: missing dependencies!"
    for dep in "${DEPENDENCIES_MISSING[@]}"; do
        printf 'Missing: %s\n' "$dep"
        # The output captured when the command was attempted above:
        echo "${ERROR_OUTPUTS[$dep]}"
        echo
    done
    exit 1
fi
# cleanup (each deletion is started as a background job):
## the `_cache`/`_site` directories:
rm -rf -- ./_cache/ ./_site/ &
## delete Emacs temporary files which tend to break the build in unpredictable ways:
### `flycheck_*.hs` files under the build directory:
find ./static/build/ -type f -name 'flycheck_*.hs' -delete 2> /dev/null &
### `#…​.md#` files anywhere in the tree:
find ./ -type f -name '#*\.md#' -delete 2> /dev/null &
# Refuse to build when the filesystem holding the home directory has fewer than $MIN_GB gigabytes available.
MIN_GB="6"
# Ask `df` for only the 'available' column (in G units) instead of picking row 2 / column 4 out of
# its default table; the last output line is the value, and everything but digits (the 'G' suffix,
# padding) is stripped.
FREE_GB="$(df --output=avail --block-size=G ~/ 2>/dev/null | tail --lines=1)"
FREE_GB="${FREE_GB//[!0-9]/}"
if [ -z "$FREE_GB" ]; then
    # `df` failed or printed something unexpected. Previously this surfaced only as an
    # "integer expression expected" error from `[`; keep going (best-effort check) but say why.
    echo "Warning: could not determine free space in home directory; skipping the free-space check." >&2
elif [ "$FREE_GB" -lt "$MIN_GB" ]; then
    red "Error: Less than $MIN_GB gigabytes of free space in home directory; one cannot reliably compile Gwern.net with so little space, so exiting." >&2
    exit 2
fi
if [ -n "$(pgrep hakyll)" ]
then
red "or Hakyll already running?"
else
set -e
# lower priority of everything we run (some of it is expensive):
renice --priority 19 --pid "$$" &>/dev/null
ionice --class 3 --pid "$$" &>/dev/null
## Parallelization: WARNING: post-2022-03 Hakyll uses parallelism which catastrophically slows down at >= # of physical cores; see
# Defaults, overridable from the command line:
N=14                  # presumably the parallelism level the warning above is about; replaced by any all-digit argument
SLOW="true"           # emptied by `--fast`
SKIP_DIRECTORIES=""   # set to "true" by `--skip-directories`
TODAY=$(date '+%F')

# parse_build_args ARG...
#   Interpret the script's command-line arguments, updating the globals above:
#     --fast               → SLOW=""
#     --skip-directories   → SKIP_DIRECTORIES="true"
#     <digits>             → N=<digits>
#   Anything else is ignored. That includes the empty string, which contains no non-digit
#   character and so used to fall through to the numeric branch and blank out N.
parse_build_args() {
    local arg
    for arg in "$@"; do
        case "$arg" in
            --fast) SLOW="" ;;
            --skip-directories) SKIP_DIRECTORIES="true" ;;
            '' | *[!0-9]*) ;; # skip empty arguments & non-numbers
            *) N="$arg" ;;
        esac
    done
}
parse_build_args "$@"
# s ARGS...: short alias which forwards all of its arguments to `gwsed`.
s() { gwsed "$@"; }
export SLOW SKIP_DIRECTORIES N
# A function only reaches child Bash processes via `export -f`; listing `s` in a plain `export`
# merely flags a (nonexistent) *variable* named `s`.
# NOTE(review): a child shell can only run `s` if `gwsed` is itself exported by bash.sh — confirm.
export -f s
if [[ -n "$SLOW" ]]; then
    # quickly summarize pending changes (a failure here is ignored)
    ( cd ~/wiki/ && git status ) || true
fi
bold "Pulling infrastructure updates…"
# Pull from Said Achmiz's repo so that we have the latest JS/CSS: his edits override mine in any
# conflict (`-Xtheirs`), and merges are committed with the default message (`--no-edit`). (This is a
# bit tricky because the use of versioning in the includes means we get a lot of merge conflicts,
# for some reason.) The pull is capped at 5 minutes, and any failure in this step is ignored.
(
    cd ./static/ \
        && git status \
        && timeout 5m git pull --strategy-option=theirs --no-edit --verbose 'https://gwern.obormot.net/static/.git/' master
) || true
if [ "$SLOW" ]; then
bold "Executing string rewrite cleanups…" # automatically clean up some Gwern.net bad URL patterns, typos, inconsistencies, house-styles:
( set +e
## domain/URL rewrites:
### host canonicalization (the greaterwrong→lesswrong replacement must keep the trailing '/', otherwise '…/posts/ID' becomes '…/postsID'):
s 'https://mobile.x.com' 'https://x.com'; s 'https://www.x.com' 'https://x.com'; s 'https://twitter.com/' 'https://x.com/'; s 'https://en.reddit.com/' 'https://www.reddit.com/'; s 'https://www.greaterwrong.com/posts/' 'https://www.lesswrong.com/posts/'; s 'http://web.archive.org/web/' 'https://web.archive.org/web/'; s 'https://youtu.be/' 'https://www.youtube.com/watch?v='; s 'http://arxiv.org' 'https://arxiv.org'; s 'https://deepmind.com' 'https://www.deepmind.com'; s 'http://en.wikipedia.org' 'https://en.wikipedia.org';
### '.full' suffix normalization (several rules are repeated on purpose so that stacked suffixes collapse):
s 'v1.full' '.full'; s 'v2.full' '.full'; s 'v3.full' '.full'; s 'v4.full' '.full'; s 'v5.full' '.full'; s 'v6.full' '.full'; s 'v7.full' '.full'; s 'v8.full' '.full'; s 'v9.full' '.full'; s '.full-text' '.full'; s '.full.full' '.full'; s '.full-text' '.full'; s '.full-text.full' '.full'; s '.full.full.full' '.full'; s '.full.full' '.full';
### miscellaneous per-site fixes:
s '.gov/labs/pmc/articles/P' '.gov/pmc/articles/P'; s 'rjlipton.wpcomstaging.com' 'rjlipton.wordpress.com'; s 'www.super-memory.com' 'super-memory.com'; s 'https://www.bldgblog.com' 'https://bldgblog.com'; s 'https://www.clinicaltrials.gov' 'https://clinicaltrials.gov'; s 'https://arxiv.org/abs//' 'https://arxiv.org/abs/'; s 'http://paulgraham.com' 'https://paulgraham.com'; s 'http://www.paulgraham.com' 'https://paulgraham.com'; s "https://www.paulgraham.com" "https://paulgraham.com"; s 'https://www.arxiv.org/' 'https://arxiv.org/';
## NOTE: domains which are bad or unfixable are handled by a later lint. This is only for safe rewrites.
## link cruft rewrites:
s '&hl=en&oi=ao' ''; s '&hl=en' ''; s '?hl=en&' '?'; s '?hl=en' ''; s '?usp=sharing' ''; s '?via%3Dihub' ''; s '.html?pagewanted=all' '.html'; s '&feature=youtu.be' ''; s '?app=desktop&' '?'; s ':443/' '/'; s ':80/' '/'; s '?s=r' ''; s '?s=61' ''; s '?sd=pf' ''; s '?ref=The+Browser-newsletter' ''; s '?ref=thebrowser.com' ''; s '?ignored=irrelevant' ''; s '](/docs/' '](/doc/'; s 'href="/docs/' 'href="/doc/'; s '.pdf#pdf' '.pdf'; s '#fromrss' ''; s '&hl=en' ''; s '?rss=1' ''; s '/doc/statistics/decision-theory' '/doc/statistics/decision'; s '?ref=quillette.com' ''; s '?login=false' ''; s '?open=false#' '#'; s 'https://amp.theguardian.com/' 'https://theguardian.com/'; s '?wprov=sfti1' '';
stringReplace '&oi=ao' '' ./static/build/Config/Metadata/Author.hs; stringReplace '&hl=en' '' ./static/build/Config/Metadata/Author.hs; stringReplace '&oi=sra' '' ./static/build/Config/Metadata/Author.hs; stringReplace '?hl=en&' '?' ./static/build/Config/Metadata/Author.hs
## surname/entity consistency & fixing common spelling errors:
### Fixed here: ' CIFAR10' now keeps its leading space in the replacement (it used to glue 'CIFAR-10' onto the preceding word), and the 'R. A. Fisher' replacement no longer contains a line break.
s 'EMBASE' 'Embase'; s 'Medline' 'MEDLINE'; s 'PsychINFO' 'PsycINFO'; s 'MSCOCO' 'MS COCO'; s 'Yann Le Cun' 'Yann LeCun'; s ' VQVAE' ' VQ-VAE'; s 'CIFAR 10' 'CIFAR-10'; s 'Jorges Luis Borges' 'Jorge Luis Borges'; s 'Rene Girard' 'René Girard'; s 'Anno Hideaki' 'Hideaki Anno'; s ' GPT2' ' GPT-2'; s ' Clinicaltrials.gov' ' ClinicalTrials.gov'; s ' clinicaltrials.gov' ' ClinicalTrials.gov'; s 'Dario Amodai' 'Dario Amodei'; s 'single nucleotide polymorph' 'single-nucleotide polymorph'; s 'Single Nucleotide Polymorph' 'Single-Nucleotide Polymorph'; s 'single nucleotide variant' 'single-nucleotide variant'; s ' CIFAR10' ' CIFAR-10'; s 'TyDi QA' 'TyDiQA'; s 'Türkiye' 'Turkey'; s ' Poincare' ' Poincaré'; s 'Francois de La Rochefoucauld' 'François de La Rochefoucauld'; s 'Moliere' 'Molière'; s 'behavioural genetic' 'behavioral genetic'; s ' gwern.net' ' Gwern.net';
s 'chain of thought' 'chain-of-thought'; s 'Chain Of Thought' 'Chain-Of-Thought'; s 'Chain of Thought' 'Chain-of-Thought'; s 'Chain of thought' 'Chain-of-thought'; s 'MS Marco' 'MS MARCO'; s 'MS-MARCO' 'MS MARCO'; s 'NLSY-79' 'NLSY79'; s 'NLSY-97' 'NLSY97'; s 'state of the art' 'state-of-the-art'; s 'State of the Art' 'State-of-the-Art'; s 'State of the art' 'State-of-the-art'; s 'State Of The Art' 'State-of-the-Art'; s 'Enwik8' 'enwik8'; s 'enwiki8' 'enwik8'; s 'G. M. Fahy' 'Gregory M. Fahy'; s 'Greg M. Fahy' 'Gregory M. Fahy'; s 'Gary Kasparov' 'Garry Kasparov'; s 'Fel D1' 'Fel D 1'; s 'Fel d1' 'Fel d 1'; s 'CIFAR10' 'CIFAR-10'; s 'ImageNet1k' 'ImageNet-1k'; s 'ImageNet21k' 'ImageNet-21k'; s ' Imagenet' ' ImageNet'; s ' LeGuin' ' Le Guin'; s 'DALL-E 1' 'DALL·E 1'; s 'DALL-E 2' 'DALL·E 2'; s 'DALLE-2 ' 'DALL·E 2 '; s 'DALL-E 3' 'DALL·E 3'; s 'FLAN-PALM' 'Flan-PaLM'; s 'GPT-4V' 'GPT-4-V'; s 'GPT-4 V' 'GPT-4-V'; s ' GPT4' ' GPT-4'; s 'drop cap' 'dropcap'; s 'Drop cap' 'Dropcap'; s 'Drop Cap' 'Dropcap'; s 'R.A. Fisher' 'R. A. Fisher';
s 'Larry Sumners' 'Larry Summers'; s ' auto-encoder' ' autoencoder'; s 'Auto-Encoder' 'Autoencoder'; s ' GPT3' ' GPT-3' ; s ' GPT4' ' GPT-4'; s 'J.R.R. Tolkien' 'J. R. R. Tolkien'; s 'F.D.A.' 'FDA'; s 'C.D.C.' 'CDC'; s 'F.B.I.' 'FBI'; s 'C.I.A.' 'CIA'; s ' Onlyfans' ' OnlyFans'; s ' A.I.' ' AI'; s ' Juergen' ' Jürgen'; s ' Godel' ' Gödel'; s ' Goedel' ' Gödel'; s 'Bryne Hobart' 'Byrne Hobart'; s 'Saigyo' 'Saigyō'; s 'John Wentsworth' 'John Wentworth'; s ' othre' ' other'; s 'edtorial' 'editorial'; s ' Javascript' ' JavaScript'; s ' brandnew' ' brand-new'; s ' Erdos' ' Erdős'; s 'Ursula K Le Guin' 'Ursula K. Le Guin';
## abbreviation consistency:
s '(ie,' '(ie.'; s '(ie ' '(ie. '; s 'i.e.,' 'ie.'; s 'ie., ' 'ie. '; s '(i.e.' '(ie.'; s '(eg, ' '(eg. '; s ' eg ' ' eg. '; s '(eg ' '(eg. '; s '[eg ' '[eg. '; s '[Eg ' '[eg. '; s 'e.g. ' 'eg. '; s ' e.g. ' ' eg. '; s 'e.g.,' 'eg.'; s 'eg.,' 'eg.'; s 'E.g.,' 'Eg.'; s '(cf ' '(cf. '; s ' cf ' ' cf. '; s 'c.f., ' 'cf. '; s 'v.s.' 'versus';
### 'etc' → 'etc.' whenever it is preceded by a space and followed by a space or punctuation
### (fixed: the ' etc”' rule used to emit ' .etc”', with the period on the wrong side):
s ' etc ' ' etc. '; s ' etc)' ' etc.)'; s ' etc,' ' etc.,'; s ' etc]' ' etc.]'; s ' etc’' ' etc.’'; s ' etc---' ' etc.---'; s ' etc|' ' etc.|'; s ' etc?' ' etc.?'; s ' etc;' ' etc.;'; s ' etc:' ' etc.:'; s ' etc"' ' etc."'; s ' etc[' ' etc.['; s " etc'" " etc.'"; s ' etc!' ' etc.!'; s ' etc—' ' etc.—'; s ' etc”' ' etc.”';
### line-final 'etc': the replacement now keeps the line break instead of deleting it.
s 'etc
' 'etc.
';
### DISABLED: the unanchored rule below (it appeared twice) also matches text which already reads
### 'etc.', so each pass would append yet another period ('etc.' → 'etc..' → 'etc...').
### NOTE(review): it looks like the pattern lost a trailing delimiter at some point; restore the
### intended anchored form before re-enabling.
# s 'etc' 'etc.'
s ' Feb ' ' February '; s ' Aug ' ' August '; s ', Jr.' ' Junior'; s ' Jr.' ' Junior'; s ', Junior' ' Junior';
### DISABLED: as written, this rule lowercases *every* 'Th' across the site ('The' → 'the', 'This' → 'this', …).
### NOTE(review): together with the two identity rewrites that follow (identical from/to strings, hence no-ops),
### this looks like ordinal-suffix markup that was lost from the patterns; restore the intended from/to
### strings before re-enabling — TODO confirm against history.
# s 'Th' 'th';
s ' 20th' ' 20th'; s ' 21st' ' 21st';
s ',”' '”,'; s ",’" "’,"; s '(vs. ' '(versus '; s ' vs. ' ' versus '; s ' Vs. ' ' Versus '; s 'best-of-N' 'best-of-n'; s ' Best-Of-N ' ' Best-of-n '; s 'Best-of-n ' 'Best-of-n '; s ' best-of-n' ' best-of-n'; s ' best-of-N' ' best-of-n';
### NOTE: Not safe to do site-wide with `gwsed` because it stomps all over R transcripts where quartiles
### are often reported in summaries like '1st'; we can do it safely for GTX because no R sessions there (for now):
### DISABLED: assuming `stringReplace` is a literal substring replacement (as its name & its other uses suggest),
### the three case-folding rules below would lowercase every 'St'/'Nd'/'Rd' in the GTX metadata, not just ordinal suffixes.
### NOTE(review): the remaining rules have identical from/to strings (no-ops), which suggests that
### markup around the ordinal suffix was lost from all of these patterns — TODO restore & re-enable.
# stringReplace 'St' 'st' ./metadata/*.gtx; stringReplace 'Nd' 'nd' ./metadata/*.gtx; stringReplace 'Rd' 'rd' ./metadata/*.gtx;
stringReplace ' 1st ' ' 1st ' ./metadata/*.gtx; stringReplace ' 2nd' ' 2nd' ./metadata/*.gtx; stringReplace ' 3rd' ' 3rd' ./metadata/*.gtx; stringReplace ' 4th' ' 4th' ./metadata/*.gtx;
## spelling errors:
s 'border colly' 'border collie'; s 'genomewide' 'genome-wide'; s 'regularise' 'regularize'; s ' residualis' ' residualiz'; s 'endelian randomisation' 'endelian randomization'; s 'mendelian randomization' 'Mendelian Randomization'; s 'Mendelian randomization' 'Mendelian Randomization'; s 'canalization' 'canalisation'; s 'Statistical significance' 'Statistical-significance'; s 'Statistical Significance' 'Statistical-Significance'; s 'statistical significance' 'statistical-significance'; s ' longstanding' ' long-standing'; s 'utilise' 'utilize'; s 'facebookok' 'facebook'; s 'Tartarian' 'Tatarian'; s 'tartarian' 'tatarian'; s ' an One' ' a One'; s ' an one' ' a one'; s '
he ' '
He '; s ' lik ' ' like '; s ' Behaviour ' ' Behavior '; s ' behaviour ' ' behavior '; s ' anaesthesia' ' anesthesia'; s ' Modelling' ' Modeling'; s ' modelling' ' modeling'; s ' colour' ' color'; s ' Colour' ' Color'; s 'multicentre' 'multicenter'; s 'Multicentre' 'Multicenter'; s ' Cluster-Randomis' ' Cluster-Randomiz'; s ' Non-Randomis' ' Non-Randomiz'; s ' Non-randomised' ' Non-randomized'; s ' Randomis' ' Randomiz'; s ' cluster-randomis' ' cluster-randomiz'; s ' non-randomis' ' non-randomiz'; s ' non-randomised' ' non-randomized'; s ' nonrandomised' ' non-randomised'; s ' quasi-randomised' ' quasi-randomized'; s ' randomis' ' randomiz'; s 'categoris' 'categoriz'; s ' ageing' ' aging'; s ' Ageing' ' Aging'; s 'Likert-scale' 'Likert scale'; s 'discussiom' 'discussion'; s ' Homogeneous' ' Homogenous'; s ' homogeneous' ' homogenous'; s ' Non-Homogeneous' ' Non-Homogenous'; s ' non-homogeneous' ' non-homogenous'; s 'Homogeneous:' 'Homogenous:'; s '“homogeneous”' '“homogenous”'; s ' ancestry-homogeneous' ' ancestry-homogenous'; s ' inhomogeneous' ' inhomogenous'; s ' continuee ' ' continue '; s ' A LLM' ' An LLM'; s ' a LLM' ' an LLM';
## citation consistency:
### put a space between a closing bracket and a following '^[':
s ']^[' '] ^['
### 'et al' is written without periods or a trailing comma:
s 'et. al.' 'et al'
s 'et al. (' 'et al ('
s ' et al. 1' ' et al 1'
s ' et al. 2' ' et al 2'
s ' et al., ' ' et al '
s 'et al., ' 'et al '
s ' et. al ' ' et al '
### 'pg N' → 'pgN' for each leading digit 1–9:
for digit in 1 2 3 4 5 6 7 8 9; do
    s "pg $digit" "pg$digit"
done
### WARNING: when using `+` in sed, by default, it is treated as an ordinary literal. It MUST be escaped to act as a regexp! Whereas in `grep --extended-regexp`, it's the opposite. So remember: `\+` in sed, and `+` in grep.
### WARNING: remember that `sed -i` modifies the last-modified timestamp of all files it runs on, even when the file was not, in fact, modified!
# fix_citation_parens DIR
#   In every regular `*.md`/`*.gtx` file below DIR, strip the parentheses around the year in
#   'Surname et al (YYYY[x])' and rewrite 'Surname and Surname (YYYY[x])' as 'Surname & Surname YYYY[x]'.
#   Each file is `grep`ped first, so `sed -i` (which bumps the modification time even when nothing
#   changes) only runs on files that actually contain a match.
#   Paths are read NUL-delimited, so names containing whitespace or glob characters survive intact,
#   and the `-name` alternatives are grouped so that `-type f` applies to both of them.
fix_citation_parens() {
    local root="$1" file
    while IFS= read -r -d '' file; do
        if grep --extended-regexp --quiet "[A-Z][a-z]+ et al \([1-2][0-9]{3}[a-z]?\)" "$file"; then
            sed -i -e 's/\([A-Z][a-z]\+\) et al (\([1-2][0-9][0-9][0-9][a-z]\?\))/\1 et al \2/g' "$file"
        fi
        if grep --extended-regexp --quiet "[A-Z][a-z]+ and [A-Z][a-z]+ \([1-2][0-9]{3}[a-z]?\)" "$file"; then
            sed -i -e 's/\([A-Z][a-z]\+\) and \([A-Z][a-z]\+\) (\([1-2][0-9][0-9][0-9][a-z]\?\))/\1 \& \2 \3/g' "$file"
        fi
    done < <(find "$root" -type f \( -name "*.md" -o -name "*.gtx" \) -print0)
}
fix_citation_parens .
## anchor errors:
s '#allen#allen' '#allen'; s '#deepmind#deepmind' '#deepmind'; s '&org=deepmind&org=deepmind' '&org=deepmind'; s '#nvidia#nvidia' '#nvidia'; s '#openai#openai' '#openai'; s '#google#google' '#google'; s '#uber#uber' '#uber';
## HTML/Markdown formatting:
s '