#!/bin/bash

# Author: Gwern Branwen
# Date: 2016-10-01
# When: Time-stamp: "2024-04-24 22:57:42 gwern"
# License: CC-0
#
# sync-gwern.net.sh: shell script which automates a full build & sync of Gwern.net. A full build is
# intricate, and requires several passes, like generating link-bibliographies/tag-directories,
# running two kinds of syntax-highlighting, stripping cruft etc.
#
# This script automates all of that: it cleans up, compiles a hakyll binary for faster compilation,
# generates a sitemap XML file, optimizes the MathJax use, checks for many kinds of errors, uploads,
# and cleans up.
#
# Key dependencies: GHC, Hakyll, emacs, curl, tidy (HTML5 version), urlencode
# ('gridsite-clients' package), linkchecker, fdupes, ImageMagick, exiftool, mathjax-node-page (eg.
# `npm i -g mathjax-node-page`), parallel, xargs, php7…

. ./static/build/bash.sh

if ! [[ -n $(command -v ghc) && -n $(command -v git) && -n $(command -v rsync) && -n $(command -v curl) && -n $(command -v ping) && \
        -n $(command -v tidy) && -n $(command -v linkchecker) && -n $(command -v du) && -n $(command -v rm) && -n $(command -v find) && \
        -n $(command -v fdupes) && -n $(command -v urlencode) && -n $(command -v sed) && -n $(command -v parallel) && -n $(command -v xargs) && \
        -n $(command -v file) && -n $(command -v exiftool) && -n $(command -v identify) && -n $(command -v pdftotext) && \
        -n $(command -v ~/src/node_modules/mathjax-node-page/bin/mjpage) && -n $(command -v static/build/link-extractor.hs) && \
        -n $(command -v static/build/anchor-checker.php) && -n $(command -v php) && -n $(command -v static/build/generateDirectory.hs) && \
        -n $(command -v static/build/generateLinkBibliography.hs) && \
        -n $(command -v static/build/generateBacklinks.hs) && \
        -n $(command -v static/build/generateSimilarLinks.hs) && \
        -n $(command -v gifsicle) && \
        -n $(command -v libreoffice) && \
        -n $(command -v elinks) ]] || \
    [ -n "$(pgrep hakyll)" ]; then # abort if any dependency is missing, or if a build is already underway
    red "Dependencies missing or Hakyll already running?"
else
    set -e

    # lower the priority of everything we run (some of it is expensive):
    renice --priority 19 --pid "$$" &>/dev/null
    ionice --class 3    --pid "$$" &>/dev/null

    ## Parallelization: WARNING: post-2022-03 Hakyll uses parallelism which catastrophically slows down at >= # of physical cores; see
    N=28
    SLOW="true"
    SKIP_DIRECTORIES=""
    TODAY=$(date '+%F')

    for ARG in "$@"; do case "$ARG" in
                           --fast) SLOW="" ;;
                           --skip-directories) SKIP_DIRECTORIES="true" ;;
                           *[!0-9]*) ;; # skip non-numbers
                           *) N="$ARG" ;; # a bare numeric argument sets the parallelism level N
                       esac
    done
    export SLOW SKIP_DIRECTORIES N
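    # Usage sketch (hypothetical invocations; per the argument loop above, this script accepts '--fast',
    # '--skip-directories', & a bare number which sets the job count N):
    #   ./static/build/sync-gwern.net.sh             # full build: slow checks on, N=28 jobs
    #   ./static/build/sync-gwern.net.sh --fast 8    # skip the slow checks & use 8 jobs
    # If tuning N by hand, one way (an assumption, not something this script does) to count physical cores:
    #   N="$(lscpu --parse=CORE,SOCKET | grep -v '^#' | sort --unique | wc --lines)"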
    if [ "$SLOW" ]; then
        (cd ~/wiki/ && git status) || true;
        bold "Checking metadata…"
        pkill checkMetadata || true
        rm ~/METADATA.txt &> /dev/null || true
        TMP_CHECK=$(mktemp /tmp/"XXXXX.txt");
        ./static/build/checkMetadata >"$TMP_CHECK" 2>&1 && mv "$TMP_CHECK" ~/METADATA.txt || true &
    fi &

    bold "Pulling infrastructure updates…"
    # Pull from Said Achmiz's repo, with his edits overriding mine in any conflict (`-Xtheirs`) & auto-merging
    # with the default patch text (`--no-edit`), to make sure we have the latest JS/CSS. (This is a bit tricky,
    # because the use of versioning in the includes means we get a lot of merge conflicts, for some reason.)
    (cd ./static/ && git status && timeout 10m git pull -Xtheirs --no-edit --verbose 'https://gwern.obormot.net/static/.git/' master) || true

    if [ "$SLOW" ]; then

        bold "Executing string rewrite cleanups…" # automatically clean up some Gwern.net bad URL patterns, typos, inconsistencies, house-styles:
        # `s` abbreviates `gwsed` (presumably defined in the bash.sh sourced above): a site-wide string search-and-replace.
        ( s() { gwsed "$@"; }
          ## domain rewrites:
          s 'https://mobile.twitter.com' 'https://twitter.com'; s 'https://www.twitter.com' 'https://twitter.com'; s 'https://x.com/' 'https://twitter.com/';
          s 'https://en.reddit.com/' 'https://www.reddit.com/'; s 'https://www.greaterwrong.com/posts/' 'https://www.lesswrong.com/posts/';
          s 'http://web.archive.org/web/' 'https://web.archive.org/web/'; s 'https://youtu.be/' 'https://www.youtube.com/watch?v=';
          s 'http://arxiv.org' 'https://arxiv.org'; s 'https://deepmind.com' 'https://www.deepmind.com'; s 'http://en.wikipedia.org' 'https://en.wikipedia.org';
          s 'v1.full' '.full'; s 'v2.full' '.full'; s 'v3.full' '.full'; s 'v4.full' '.full'; s 'v5.full' '.full'; s 'v6.full' '.full'; s 'v7.full' '.full'; s 'v8.full' '.full'; s 'v9.full' '.full';
          s '.full-text' '.full'; s '.full.full' '.full'; s '.full-text' '.full'; s '.full-text.full' '.full'; s '.full.full.full' '.full'; s '.full.full' '.full';
          s '.gov/labs/pmc/articles/P' '.gov/pmc/articles/P';
          s 'rjlipton.wpcomstaging.com' 'rjlipton.wordpress.com'; s 'www.super-memory.com' 'super-memory.com'; s 'https://www.bldgblog.com' 'https://bldgblog.com';
          s 'https://www.clinicaltrials.gov' 'https://clinicaltrials.gov'; s 'https://arxiv.org/abs//' 'https://arxiv.org/abs/';
          s 'http://paulgraham.com' 'https://paulgraham.com'; s 'http://www.paulgraham.com' 'https://paulgraham.com'; s "https://www.paulgraham.com" "https://paulgraham.com";
          s 'https://scribe.rip' 'https://freedium.cfd';
          ## NOTE: domains which are bad or unfixable are handled by a later lint. This is only for safe rewrites.
          ## link cruft rewrites:
          s '&hl=en' ''; s '?hl=en&' '?'; s '?hl=en' ''; s '?usp=sharing' ''; s '?via%3Dihub' ''; s '.html?pagewanted=all' '.html'; s '&feature=youtu.be' '';
          s ':443/' '/'; s ':80/' '/'; s '?s=r' ''; s '?s=61' ''; s '?sd=pf' ''; s '?ref=The+Browser-newsletter' ''; s '?ref=thebrowser.com' ''; s '?ignored=irrelevant' '';
          s '](/docs/' '](/doc/'; s 'href="/docs/' 'href="/doc/'; s '.pdf#pdf' '.pdf'; s '#fromrss' ''; s '&hl=en' ''; s '?rss=1' '';
          s '/doc/statistics/decision-theory' '/doc/statistics/decision'; s '?ref=quillette.com' ''; s '?login=false' '';

          ## name/entity consistency:
          s 'EMBASE' 'Embase'; s 'Medline' 'MEDLINE'; s 'PsychINFO' 'PsycINFO'; s 'MSCOCO' 'MS COCO'; s 'Yann Le Cun' 'Yann LeCun'; s ' VQVAE' ' VQ-VAE'; s 'CIFAR 10' 'CIFAR-10';
          s 'Jorges Luis Borges' 'Jorge Luis Borges'; s 'Rene Girard' 'René Girard'; s 'Anno Hideaki' 'Hideaki Anno'; s ' GPT2' ' GPT-2';
          s ' Clinicaltrials.gov' ' ClinicalTrials.gov'; s ' clinicaltrials.gov' ' ClinicalTrials.gov'; s 'Dario Amodai' 'Dario Amodei';
          s 'single nucleotide polymorph' 'single-nucleotide polymorph'; s 'Single Nucleotide Polymorph' 'Single-Nucleotide Polymorph'; s 'single nucleotide variant' 'single-nucleotide variant';
          s ' CIFAR10' ' CIFAR-10'; s 'TyDi QA' 'TyDiQA'; s 'Türkiye' 'Turkey'; s ' Poincare' ' Poincaré'; s 'Francois de La Rochefoucauld' 'François de La Rochefoucauld'; s 'Moliere' 'Molière';
          s 'behavioural genetic' 'behavioral genetic'; s ' gwern.net' ' Gwern.net';
          s 'chain of thought' 'chain-of-thought'; s 'Chain Of Thought' 'Chain-Of-Thought'; s 'Chain of Thought' 'Chain-of-Thought'; s 'Chain of thought' 'Chain-of-thought';
          s 'MS Marco' 'MS MARCO'; s 'MS-MARCO' 'MS MARCO'; s 'NLSY-79' 'NLSY79'; s 'NLSY-97' 'NLSY97';
          s 'state of the art' 'state-of-the-art'; s 'State of the Art' 'State-of-the-Art'; s 'Enwik8' 'enwik8';
          s 'G. M. Fahy' 'Gregory M. Fahy'; s 'Greg M. Fahy' 'Gregory M. Fahy'; s 'Gary Kasparov' 'Garry Kasparov'; s 'Fel D1' 'Fel D 1'; s 'Fel d1' 'Fel d 1';
          s 'CIFAR10' 'CIFAR-10'; s 'ImageNet1k' 'ImageNet-1k'; s 'ImageNet21k' 'ImageNet-21k'; s ' LeGuin' ' Le Guin';
          s 'DALL-E 1' 'DALL·E 1'; s 'DALL-E 2' 'DALL·E 2'; s 'DALL-E 3' 'DALL·E 3'; s 'FLAN-PALM' 'Flan-PaLM';
          s 'GPT-4V' 'GPT-4-V'; s 'GPT-4 V' 'GPT-4-V'; s ' GPT4' ' GPT-4';
          s 'drop cap' 'dropcap'; s 'Drop cap' 'Dropcap'; s 'Drop Cap' 'Dropcap'; s 'R.A. Fisher' 'R. A. Fisher'; s 'Larry Sumners' 'Larry Summers';

          ## abbreviation consistency:
          s '(ie,' '(ie.'; s '(ie ' '(ie. '; s 'i.e.,' 'ie.'; s 'ie., ' 'ie. '; s '(i.e.' '(ie.'; s '(eg, ' '(eg. '; s ' eg ' ' eg. '; s '(eg ' '(eg. '; s '[eg ' '[eg. ';
          s 'e.g. ' 'eg. '; s ' e.g. ' ' eg. '; s 'e.g.,' 'eg.'; s 'eg.,' 'eg.'; s 'E.g.,' 'Eg.'; s '(cf ' '(cf. '; s ' cf ' ' cf. '; s ' Feb ' ' February '; s ' Aug ' ' August ';
          s ', Jr.' ' Junior'; s ' Jr.' ' Junior'; s ', Junior' ' Junior';
          s 'Th' 'th'; s 'St' 'st'; s 'Nd' 'nd'; s 'Rd' 'rd';
          s ',”' '”,'; s ",’" "’,";

          ## spelling errors:
          s 'border colly' 'border collie'; s 'genomewide' 'genome-wide'; s 'regularise' 'regularize'; s ' residualis' ' residualiz';
          s 'endelian randomisation' 'endelian randomization'; s 'mendelian randomization' 'Mendelian Randomization'; s 'Mendelian randomization' 'Mendelian Randomization';
          s 'canalization' 'canalisation';
          s 'Statistical significance' 'Statistical-significance'; s 'Statistical Significance' 'Statistical-Significance'; s 'statistical significance' 'statistical-significance';
          s ' longstanding' ' long-standing'; s 'utilise' 'utilize'; s 'facebookok' 'facebook'; s 'Tartarian' 'Tatarian'; s 'tartarian' 'tatarian';
          s ' an One' ' a One'; s ' an one' ' a one';
          s '

he ' '

He '; s ' lik ' ' like '; s ' Behaviour ' ' Behavior '; s ' behaviour ' ' behavior ';

          ## citation consistency:
          s ']^[' '] ^['; s 'et. al.' 'et al'; s 'et al. (' 'et al ('; s ' et al. 1' ' et al 1'; s ' et al. 2' ' et al 2'; s ' et al., ' ' et al '; s 'et al., ' 'et al ';
          ### WARNING: when using `+` in sed, by default, it is treated as an ordinary literal. It MUST be escaped to act as a regexp! Whereas in `grep -E`, it's the opposite. So remember: `\+` in sed, and `+` in grep.
          ### WARNING: remember that `sed -i` modifies the last-modified timestamp of all files it runs on, even when the file was not, in fact, modified!
          for file in $(find . -name "*.md" -or -name "*.gtx"); do
              # the `grep -q` guard means we only run `sed -i` (& so only bump the timestamp) on files which actually contain a match:
              if grep -qE "[A-Z][a-z]+ et al \([1-2][0-9]{3}[a-z]?\)" "$file"; then
                  sed -i -e 's/\([A-Z][a-z]\+\) et al (\([1-2][0-9][0-9][0-9][a-z]\?\))/\1 et al \2/g' "$file"
              fi
              if grep -qE "[A-Z][a-z]+ and [A-Z][a-z]+ \([1-2][0-9]{3}[a-z]?\)" "$file"; then
                  sed -i -e 's/\([A-Z][a-z]\+\) and \([A-Z][a-z]\+\) (\([1-2][0-9][0-9][0-9][a-z]\?\))/\1 \& \2 \3/g' "$file"
              fi
          done
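          ### Aside: a hedged illustration of the two WARNINGs above (commented-out, not run by the build):
          ### "one or more digits" is `grep -E '[0-9]+'` but `sed -e 's/[0-9]\+//'`; an unescaped `+` in sed
          ### matches only a literal plus sign. And a sketch of timestamp-preserving `sed -i`, assuming GNU
          ### `cp`/`cmp`/`touch` (the `grep -q` guard in the loop above is the cheaper version of the same idea):
          ###   cp --preserve=timestamps "$file" /tmp/before
          ###   sed -i -e 's/foo/bar/' "$file"
          ###   cmp --silent "$file" /tmp/before && touch -r /tmp/before "$file"; rm /tmp/before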
          ## anchor errors:
          s '#allen#allen' '#allen'; s '#deepmind#deepmind' '#deepmind'; s '&org=deepmind&org=deepmind' '&org=deepmind'; s '#nvidia#nvidia' '#nvidia'; s '#openai#openai' '#openai'; s '#google#google' '#google'; s '#uber#uber' '#uber';

          ## HTML/Markdown formatting:
          s '

' '

';
          s ' _n_s' ' ns'; s ' (n = ' ' (n = '; s ' (N = ' ' (n = '; s ' de novo ' ' de novo '; s ' De Novo ' ' De Novo '; s 'backlinks-not' 'backlink-not';
          s ',' ','; s ': ' ': '; s ';' ';'; s ' _r_s' ' rs';
          s '# External links' '# External Links'; s '# See also' '# See Also'; s '"abstract-collapse abstract"' '"abstract abstract-collapse"';
          s "‐" "-"; s 'class="link-auto"' ''; s '𝑂(' '𝒪('; s ' and ' ' & ';
          s 'augmentation,' 'augmentation,'; s 'Bitcoin,' 'Bitcoin,'; s 'class="invertible"' 'class="invert"'; s '”>' '">';
          s '
' '
'; s '
' '
';
          s ' id="cb1"' ''; s ' id="cb2"' ''; s ' id="cb3"' ''; s ' id="cb4"' '';
          s '.svg-530px.jpg' '.svg'; s ' (”' ' (“'; s '’s' '’s'; s '-530px.jpg' ''; s '-768px.png' ''; s '-768px.jpg' ''; s '—-' '—';
          s 'collapse-summary' 'abstract-collapse'; s 'collapse-abstract' 'abstract-collapse'; s 'href="ttp' 'href="http'; s '\xmlpi{\\}' '';
          s '°C' '℃'; s '° C' '℃'; s '°F' '℉'; s '° F' '℉'; s '℉ahrenheit' '℉'; s '℃elsius' '℃'; s ' ℃' '℃'; s ' ℉' '℉';
          s 'marginnnote' 'marginnote';
          s '
' ''; s '
' '';
          s '::' '
:';
          s '](//' '](/'; s '{.full-width' '{.width-full';
          s '

' '
';
          s '](/home/gwern/wiki/' '](/';
          s 'Cite-Author' 'cite-author'; s 'Cite-Date' 'cite-date'; s 'Cite-Joiner' 'cite-joiner'; s 'class="Cite' 'class="cite'; s 'Logotype-Tex' 'logotype-tex';
          s '

' '

';
          s '’ ”' '’ ”'; s ' ”' ' “'; s '[("doi","")]' ''; s '>/a>' '></a>';
          s 'href="W!"' 'href="!W"'; s 'class="Logotype-Tex"' 'class="logotype-tex"'; s 'Class="Logotype-Tex"' 'class="logotype-tex"';
          ## pandoc bug fixed in pandoc 3.1.1 (2023-03-05), so can remove these two rewrites once I upgrade past that:
          s 'class="odd odd' 'class="odd'; s 'class="even even' 'class="even';
          s '  ' ' '; s '​ ' ' ';
        ) &> /dev/null &

        sed -i -e 's/ data-link-\?[Tt]ags="[a-z0-9 \/-]\+">/>/' ./metadata/*.gtx;
    fi

    bold "Compiling…"
    cd ./static/build
    WARNINGS=""
    if [ "$SLOW" ]; then WARNINGS="-Wall -Werror"; fi
    compile () { ghc -O2 $WARNINGS -rtsopts -threaded --make "$@"; }
    compile hakyll.hs
    if [ -z "$SKIP_DIRECTORIES" ]; then
        compile generateLinkBibliography.hs
        compile generateDirectory.hs
    fi
    compile preprocess-markdown.hs
    compile guessTag.hs &
    compile changeTag.hs &
    compile checkMetadata.hs &
    ## NOTE: generateSimilarLinks.hs & link-suggester.hs are done at midnight by a cron job, because
    ## they are too slow to run during a regular site build & don't need to be super-up-to-date
    ## anyway
    cd ../../

    # cleanup:
    rm --recursive --force -- ./_cache/ ./_site/

    if [ "$SLOW" ]; then
        bold "Checking embeddings database…"
        ghci -istatic/build/ ./static/build/GenerateSimilar.hs -e 'e <- readEmbeddings' &>/dev/null

        # This duplicates a later check, but if we have a fatal link error, we'd rather find out now than 30 minutes later while generating annotations:
        λ(){ gf -e 'href=""' -e 'href="!W">' -e "href='!W'>" -- ./metadata/*.gtx || true; }
        wrap λ "Malformed empty link in annotations?"

        # Another early fatal check: if there is a Markdown file 'foo.md' and also a subdirectory 'foo/' in the
        # same directory, then this will later result in a fatal error when we try to compile 'foo.md' → 'foo'
        # (the HTML file) but 'foo' (the directory) already exists.
        # Check if any files collide with directories of the same name (without the .md extension).
        # Usage: find_colliding_files [path]
        function find_colliding_files() { # GPT-3-written:
            set -euo pipefail
            path="${1:-.}"
            find "$path" -depth -type f -name "*.md" -exec sh -c '
            for file do
                path="$(dirname "$file")/$(basename "$file" ".md")"
                if [ -e "$path" ] && [ ! -L "$path" ]; then
                    if [ -d "$path" ]; then
                        printf "Fatal error: Directory exists with the same name as file %s\n" "$file" >&2
                        exit 1
                    else
                        printf "Fatal error: File exists with the same name as file %s\n" "$file" >&2
                        exit 1
                    fi
                fi
            done' sh {} +
        }
        find_colliding_files ./
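        # Illustration of the collision find_colliding_files catches (hypothetical paths, not executed):
        #   doc/ai/scaling.md   # would compile to the HTML file 'doc/ai/scaling'…
        #   doc/ai/scaling/     # …but a directory of that name already exists, so the build would die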
        # We update the linkSuggestions.el in a cron job, because it is too expensive & vastly slows down the build.

        # Update the directory-listing index pages: there are a number of directories we want to avoid,
        # like the various mirrors or JS projects, or directories just of data like CSVs, or dumps of
        # docs, so we'll blacklist those:
        DIRECTORY_TAGS="$(find doc/ fiction/ haskell/ newsletter/ nootropic/ note/ review/ zeo/ -type d \
                              | sort | gfv -e 'doc/www' -e 'doc/rotten.com' -e 'doc/genetics/selection/www.mountimprobable.com' \
                                           -e 'doc/biology/2000-iapac-norvir' -e 'doc/gwern.net-gitstats' -e 'doc/reinforcement-learning/armstrong-controlproblem' \
                                           -e 'doc/statistics/order/beanmachine-multistage' -e 'doc/personal/2011-gwern-yourmorals.org/' \
                                           -e 'confidential/' -e 'private/' -e 'secret/' -e 'newest/')"

        if [ -z "$SKIP_DIRECTORIES" ]; then
            bold "Updating link bibliographies…"
            ./static/build/generateLinkBibliography +RTS -N"$N" -RTS || true

            # we want to generate all directories first, before running Hakyll, in case a new tag was created:
            bold "Building directory indexes…"
            ./static/build/generateDirectory +RTS -N"$N" -RTS $DIRECTORY_TAGS
        fi
    fi

    bold "Check & update VCS…"
    (ping -q -c 5 google.com &> /dev/null && cd ./static/ && git status; git pull; git push --verbose &) || true

    # Cleanup pre:
    rm --recursive --force ./static/build/*.o ./static/build/*.hi ./static/build/generateDirectory ./static/build/generateLinkBibliography ./static/build/generateBacklinks || true

    cd ~/wiki/ # go to site root
    bold "Building site…"

    # Make sure all videos have 'poster' preview images. (We skip videos under the /doc/www/* archives,
    # because nothing sets a poster on them, so generating one would just be a waste of space.)
    for VIDEO in $(find . -type f -name "*.mp4" -or -name "*.webm" -or -name "*.avi" | gfv "doc/www/"); do
        POSTER="$VIDEO-poster.jpg"
        if [ ! -f "$POSTER" ]; then
            echo "Generating poster image for $VIDEO…"
            # Problem: embedded videos (eg. https://gwern.net/lorem-multimedia#video ) all look like generic
            # small black rectangles; the user has no idea what a video is until they click & begin downloading
            # the (possibly huge) video file. This also causes layout shift as the `<video>` element resizes
            # once the video finally loads.
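            # A sketch of the generation step (hedged: assumes ffmpeg is installed, & the frame-choice/quality
            # flags are assumptions rather than the build's actual settings) — extract one early frame as a
            # JPEG poster:
            #   ffmpeg -i "$VIDEO" -ss 00:00:03 -frames:v 1 -q:v 2 "$POSTER"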