#!/bin/bash

# Author: Gwern Branwen
# Date: 2016-10-01
# When: Time-stamp: "2023-03-22 11:22:53 gwern"
# License: CC-0
#
# sync-gwern.net.sh: shell script which automates a full build and sync of Gwern.net. A simple build
# can be done using 'runghc hakyll.hs build', but that is slow, semi-error-prone (did you
# remember to delete all intermediates?), and does no sanity checks or optimizations like compiling
# the MathJax to static CSS/fonts (avoiding multi-second JS delays).
#
# This script automates all of that: it cleans up, compiles a hakyll binary for faster compilation,
# generates a sitemap XML file, optimizes the MathJax use, checks for many kinds of errors, uploads,
# and cleans up.

bold () { echo -e "\033[1m$@\033[0m"; }
red  () { echo -e "\e[41m$@\e[0m"; }
## function to wrap checks and print red-highlighted warning if non-zero output (self-documenting):
wrap () { OUTPUT=$($1 2>&1)
          WARN="$2"
          if [ -n "$OUTPUT" ]; then
              echo -n "Begin: "; red "$WARN";
              echo -e "$OUTPUT";
              echo -n "End: "; red "$WARN";
          fi; }
ge () { grep -E --color=always "$@"; }
gf () { grep -F --color=always "$@"; }

# key dependencies: GHC, Hakyll, s3cmd, emacs, curl, tidy (HTML5 version), urlencode
# ('gridsite-clients' package), linkchecker, fdupes, ImageMagick, exiftool, mathjax-node-page (eg.
# `npm i -g mathjax-node-page`), parallel, xargs, php7…

if ! [[ -n $(command -v ghc) && -n $(command -v git) && -n $(command -v rsync) && -n $(command -v curl) && -n $(command -v ping) && \
        -n $(command -v tidy) && -n $(command -v linkchecker) && -n $(command -v du) && -n $(command -v rm) && -n $(command -v find) && \
        -n $(command -v fdupes) && -n $(command -v urlencode) && -n $(command -v sed) && -n $(command -v parallel) && -n $(command -v xargs) && \
        -n $(command -v file) && -n $(command -v exiftool) && -n $(command -v identify) && -n $(command -v pdftotext) && \
        -n $(command -v ~/src/node_modules/mathjax-node-page/bin/mjpage) && -n $(command -v static/build/link-extractor.hs) && \
        -n $(command -v static/build/anchor-checker.php) && -n $(command -v php) && -n $(command -v static/build/generateDirectory.hs) && \
        -n $(command -v static/build/generateLinkBibliography.hs) && \
        -n $(command -v static/build/generateBacklinks.hs) && \
        -n $(command -v static/build/generateSimilarLinks.hs) ]] && \
     [ -z "$(pgrep hakyll)" ]; then
    red "Dependencies missing or Hakyll already running?"
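    ## Example of the `wrap` check-helper defined above, as used repeatedly in the build steps below
    ## (illustrative comment only; this exact check is run later in this script):
    ##     λ(){ grep -F -e 'href=""' -- ./metadata/*.yaml || true; }
    ##     wrap λ "Malformed empty link in annotations?"
    ## ie. run the check silently, and only if it printed anything, echo its output bracketed by the red warning.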
else
    set -e

    # lower priority of everything we run (some of it is expensive):
    renice --priority 19 --pid "$$" &>/dev/null
    ionice --class 3 --pid "$$" &>/dev/null

    ## Parallelization: WARNING: post-2022-03 Hakyll uses parallelism which catastrophically slows down at >= # of physical cores; see
    N="30" # "$(if [ ${#} == 0 ]; then echo 31; else echo "$1"; fi)"

    if [ "$1" == "--slow" ]; then export SLOW="--slow"; else SLOW=""; fi
    if [ "$SLOW" ]; then (cd ~/wiki/ && git status) || true; fi &

    bold "Pulling infrastructure updates…"
    (cd ./static/ && git status && timeout 10m git pull --verbose 'https://gwern.obormot.net/static/.git/' master) || true

    if [ "$SLOW" ]; then

    bold "Executing string rewrite cleanups…" # automatically clean up some Gwern.net bad URL patterns, typos, inconsistencies, house-styles:
    ( s() { gwsed "$@"; }
      ## domain rewrites:
      s 'https://mobile.twitter.com' 'https://twitter.com'; s 'https://twitter.com/' 'https://nitter.moomoo.me/'; s 'https://mobile.twitter.com/' 'https://nitter.moomoo.me/'; s 'https://www.twitter.com/' 'https://nitter.moomoo.me/';
      s 'https://www.reddit.com/r/' 'https://old.reddit.com/r/'; s 'https://en.m.wikipedia.org/' 'https://en.wikipedia.org/'; s 'https://www.greaterwrong.com/posts/' 'https://www.lesswrong.com/posts'; s 'http://web.archive.org/web/' 'https://web.archive.org/web/';
      s 'https://youtu.be/' 'https://www.youtube.com/watch?v='; s 'http://arxiv.org' 'https://arxiv.org'; s 'https://deepmind.com' 'https://www.deepmind.com'; s 'http://en.wikipedia.org' 'https://en.wikipedia.org';
      s 'v1.full' '.full'; s 'v2.full' '.full'; s 'v3.full' '.full'; s 'v4.full' '.full'; s 'v5.full' '.full'; s 'v6.full' '.full'; s 'v7.full' '.full'; s 'v8.full' '.full'; s 'v9.full' '.full';
      s '.full-text' '.full'; s '.full.full' '.full'; s '.full-text' '.full'; s '.full-text.full' '.full'; s '.full.full.full' '.full'; s '.full.full' '.full';
      s '.gov/labs/pmc/articles/P' '.gov/pmc/articles/P'; s 'rjlipton.wpcomstaging.com' 'rjlipton.wordpress.com'; s 'www.super-memory.com' 'super-memory.com'; s 'https://www.bldgblog.com' 'https://bldgblog.com'; s 'https://www.clinicaltrials.gov' 'https://clinicaltrials.gov'
      ## link cruft rewrites:
      s '&hl=en' ''; s '?hl=en&' '?'; s '?hl=en' ''; s '?usp=sharing' ''; s '?via%3Dihub' ''; s '.html?pagewanted=all' '.html'; s '&feature=youtu.be' ''; s ':443/' '/'; s ':80/' '/'; s '?s=r' ''; s '?sd=pf' ''; s '?ref=The+Browser-newsletter' ''; s '?ignored=irrelevant' ''; s '](/docs/' '](/doc/'; s 'href="/docs/' 'href="/doc/'; s '.pdf#pdf' '.pdf';
      ## name/entity consistency:
      s 'EMBASE' 'Embase'; s 'Medline' 'MEDLINE'; s 'PsychINFO' 'PsycINFO'; s 'MSCOCO' 'MS COCO'; s 'Yann Le Cun' 'Yann LeCun'; s ' VQVAE' ' VQ-VAE'; s 'CIFAR 10' 'CIFAR-10'; s 'Jorges Luis Borges' 'Jorge Luis Borges'; s 'Rene Girard' 'René Girard'; s 'Anno Hideaki' 'Hideaki Anno'; s ' GPT2' ' GPT-2';
      s ' Clinicaltrials.gov' ' ClinicalTrials.gov'; s ' clinicaltrials.gov' ' ClinicalTrials.gov'; s 'Dario Amodai' 'Dario Amodei'; s 'single nucleotide polymorph' 'single-nucleotide polymorph'; s 'Single Nucleotide Polymorph' 'Single-Nucleotide Polymorph'; s 'single nucleotide variant' 'single-nucleotide variant'; s ' CIFAR10' 'CIFAR-10';
      s 'TyDi QA' 'TyDiQA'; s 'Türkiye' 'Turkey'; s ' Poincare' ' Poincaré'; s 'Francois de La Rochefoucauld' 'François de La Rochefoucauld'; s 'Moliere' 'Molière'; s 'behavioural genetic' 'behavioral genetic'; s ' gwern.net' ' Gwern.net';
      ## abbreviation consistency:
      s '(ie,' '(ie.'; s '(ie ' '(ie. '; s '(i.e.,' '(ie.'; s 'ie., ' 'ie. ';
      s '(i.e.' '(ie.'; s '(eg, ' '(eg. '; s ' eg ' ' eg. '; s '(eg ' '(eg. '; s '[eg ' '[eg. '; s 'e.g. ' 'eg. '; s ' e.g. ' ' eg. '; s 'e.g.,' 'eg.'; s 'eg.,' 'eg.'; s '(cf ' '(cf. '; s ' cf ' ' cf. ';
      s ' Feb ' ' February '; s ' Aug ' ' August '; s ', Jr.' ' Junior'; s ' Jr.' ' Junior'; s ', Junior' ' Junior';
      s '<sup>Th</sup>' '<sup>th</sup>'; s '<sup>St</sup>' '<sup>st</sup>'; s '<sup>Nd</sup>' '<sup>nd</sup>'; s '<sup>Rd</sup>' '<sup>rd</sup>';
      s ',”' '”,'; s ",’" "’,";
      ## spelling errors:
      s 'border colly' 'border collie'; s 'genomewide' 'genome-wide'; s 'regularise' 'regularize'; s ' residualis' ' residualiz'; s 'endelian randomisation' 'endelian randomization'; s 'mendelian randomization' 'Mendelian Randomization'; s 'Mendelian randomization' 'Mendelian Randomization'; s 'canalization' 'canalisation';
      s 'Statistical significance' 'Statistical-significance'; s 'Statistical Significance' 'Statistical-Significance'; s 'statistical significance' 'statistical-significance'; s ' longstanding' ' long-standing'; s 'utilise' 'utilize'; s 'facebookok' 'facebook'; s 'Tartarian' 'Tatarian'; s 'tartarian' 'tatarian'; s ' an One' ' a One'; s ' an one' ' a one'
      ## citation consistency:
      s ']^[' '] ^['; s 'et al. (' 'et al ('; s ' et al. 1' ' et al 1'; s ' et al. 2' ' et al 2'; s ' et al., ' ' et al '; s 'et al., ' 'et al ';
      sed -i -e 's/\([A-Z][a-z]\+\) et al (\([1-2][0-9][0-9][0-9][a-z]\?\))/\1 et al \2/g' metadata/*.yaml `find . -name "*.page" -or -name "*.yaml"`;
      sed -i -e 's/\([A-Z][a-z]\+\) and \([A-Z][a-z]\+\) (\([1-2][0-9][0-9][0-9][a-z]\?\))/\1 \& \2 \3/g' `find . -name "*.page" -or -name "*.yaml"`;
      ## anchor errors:
      s '#allen#allen' '#allen'; s '#deepmind#deepmind' '#deepmind'; s '&org=deepmind&org=deepmind' '&org=deepmind'; s '#nvidia#nvidia' '#nvidia'; s '#openai#openai' '#openai'; s '#google#google' '#google'; s '#uber#uber' '#uber';
      ## HTML/Markdown formatting:
      s '

' '

'; s ' _n_s' ' <em>n</em>s'; s ' (n = ' ' (<em>n</em> = '; s ' (N = ' ' (<em>n</em> = '; s ' de novo ' ' <em>de novo</em> '; s ' De Novo ' ' <em>De Novo</em> '; s 'backlinks-not' 'backlink-not';
      s ',' ','; s ':' ':'; s ';' ';'; s ' <Xs'; s ' _r_s' ' <em>r</em>s';
      s '# External links' '# External Links'; s '# See also' '# See Also'; s '"abstract-collapse abstract"' '"abstract abstract-collapse"'; s "‐" "-"; s 'class="link-auto"' ''; s '𝑂(' '𝒪('; s ' and ' ' & '; s '' ''; s '' '';
      s 'augmentation,' 'augmentation,'; s 'Bitcoin,' 'Bitcoin,'; s 'class="invertible"' 'class="invert"'; s '”>' '">';
      s '
' '
'; s '
' '
'; s '530px.jpg-530px.jpg' '530px.jpg'; s ' id="cb1"' ''; s ' id="cb2"' ''; s ' id="cb3"' ''; s ' id="cb4"' '';
      ## TODO: duplicate HTML classes from Pandoc reported as issue #8705 & fixed; fix should be in >pandoc 3.1.1 (2023-03-05), so can remove these two rewrites once I upgrade past that:
      s 'class="odd odd' 'class="odd'; s 'class="even even' 'class="even';
      s '  ' ' '; s '​ ' ' ';
    ) &> /dev/null &

    sed -i -e 's/ data-link-?[Tt]ags="[a-z0-9 \/-]\+">/>/' ./metadata/*.yaml;
    fi

    bold "Compiling…"
    cd ./static/build
    compile () { ghc -O2 -Wall -rtsopts -threaded --make "$@"; }
    compile hakyll.hs
    compile generateLinkBibliography.hs
    compile generateDirectory.hs
    compile preprocess-markdown.hs &
    compile guessTag.hs &
    ## NOTE: generateSimilarLinks.hs & link-suggester.hs are done at midnight by a cron job because
    ## they are too slow to run during a regular site build & don't need to be super-up-to-date
    ## anyway
    cd ../../

    if [ "$SLOW" ]; then
        bold "Checking embeddings database…"
        ghci -i/home/gwern/wiki/static/build/ ./static/build/GenerateSimilar.hs -e 'e <- readEmbeddings' &>/dev/null

        # duplicates a later check, but if we have a fatal link error, we'd rather find out now than 30 minutes later while generating annotations:
        λ(){ grep -F -e 'href=""' -- ./metadata/*.yaml || true; }
        wrap λ "Malformed empty link in annotations?"

        # We update the linkSuggestions.el in a cron job because too expensive, and vastly slows down build.

        # Update the directory listing index pages: there are a number of directories we want to avoid,
        # like the various mirrors or JS projects, or directories just of data like CSVs, or dumps of
        # docs, so we'll blacklist those:
        DIRECTORY_TAGS="$(find doc/ fiction/ haskell/ newsletter/ nootropic/ note/ review/ zeo/ -type d \
                          | sort | grep -F --invert-match -e 'doc/www' -e 'doc/rotten.com' -e 'doc/genetics/selection/www.mountimprobable.com' \
                                        -e 'doc/biology/2000-iapac-norvir' -e 'doc/gwern.net-gitstats' -e 'doc/rl/armstrong-controlproblem' \
                                        -e 'doc/statistics/order/beanmachine-multistage' -e 'doc/personal/2011-gwern-yourmorals.org/')"
        PAGES_BIBLIOGRAPHIES="$(find . -type f -name "*.page" | sort | grep -F --invert-match -e 'index.page' -e '404.page' | sed -e 's/\.\///' | shuf;
                                find . -type f -name "index.page" | grep -F --invert-match -e 'doc/') index.page"

        # wait for generateLinkBibliography to finish to ensure the annotation link-bibs are all created:
        bold "Updating link bibliographies…"
        ./static/build/generateLinkBibliography +RTS -N"$N" -RTS

        # we want to generate all directories first before running Hakyll in case a new tag was created
        bold "Building directory indexes…"
        ./static/build/generateDirectory +RTS -N"$N" -RTS $DIRECTORY_TAGS
    fi

    bold "Check/update VCS…"
    cd ./static/ && (git status; git pull; git push --verbose &)
    cd ./build/
    # Cleanup pre:
    rm --recursive --force -- ~/wiki/_cache/ ~/wiki/_site/ ./static/build/hakyll ./static/build/*.o ./static/build/*.hi ./static/build/generateDirectory ./static/build/generateLinkBibliography ./static/build/generateBacklinks ./static/build/link-extractor ./static/build/link-suggester || true

    cd ../../ # go to site root
    bold "Building site…"
    time ./static/build/hakyll build +RTS -N"$N" -RTS || (red "Hakyll errored out!"; exit 1)

    if [ "$SLOW" ]; then
        bold "Updating annotation/quote-of-the-day…"
        # NOTE: we do this at the end, instead of inside hakyll.hs, to avoid spurious uses when a compile fails
        ghci -i/home/gwern/wiki/static/build/ ./static/build/QuoteOfTheDay.hs \
            -e 'do {md <- LinkMetadata.readLinkMetadata; aotd md; qotd; lotd; }' | \
            grep -F --invert-match -e ' secs,' -e 'it :: [T.Text]' -e '[]';

        λ(){ ghci -i/home/gwern/wiki/static/build/ ./static/build/QuoteOfTheDay.hs -e 'sitePrioritize' | \
                 grep -F --invert-match -e ' secs,' -e 'it :: [T.Text]' -e '[]' || true; }
        wrap λ "Site-of-the-day: check for recommendation?"
    fi

    bold "Results size:"
    du -chs ./_cache/ ./_site/
    echo "Raw file count: $(find ./_site/ -type f | wc --lines)"
    echo "Total (including symlinks) file count: $(find ./_site/ -type f -or -type l | wc --lines)"

    # cleanup post:
    rm -- ./static/build/hakyll ./static/build/*.o ./static/build/*.hi ./static/build/generateDirectory ./static/build/generateLinkBibliography ./static/build/generateBacklinks ./static/build/link-extractor &>/dev/null || true

    ## WARNING: this is a crazy hack to insert a horizontal rule 'in between' the first 3 sections
    ## on /index (Newest/Popular/Notable), and the rest (starting with Statistics); the CSS for
    ## making the rule a block dividing the two halves just doesn't work in any other way, but
    ## Pandoc Markdown doesn't let you write stuff 'in between' sections, either. So… a hack.
    sed -i -e 's/section id=\"statistics\"/hr class="horizontal-rule-nth-1" \/>\n<section id="statistics"/' ./_site/index
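    ## Illustrative example of the rewrite above (comment only): given generated HTML containing
    ##     <section id="statistics" …>
    ## the sed yields
    ##     <hr class="horizontal-rule-nth-1" />
    ##     <section id="statistics" …>
    ## ie. the rule lands immediately before the Statistics section, 'in between' the two halves of /index.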

    ## generate ./_site/sitemap.xml for search engines:
    (echo "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
     echo "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
     ## very static files which rarely change: PDFs, images, site infrastructure:
     find -L _site/doc/ _site/image/ _site/static/ -not -name "*.page" -type f | grep -F --invert-match -e 'doc/www/' -e 'metadata/' -e '.git' -e '404' -e '/static/template/default.html' -e '-530px.jpg' -e '-768px.png' -e 'lorem' | grep -E --invert-match -e '/doc/.*/index' -e 'static/.*\..*\.html$' -e 'doc/.*\..*\.html$' | \
         sort | xargs urlencode -m | sed -e 's/%20/\n/g' | \
         sed -e 's/_site\/\(.*\)/<url><loc>https:\/\/gwern\.net\/\1<\/loc><changefreq>never<\/changefreq><\/url>/'
     ## Everything else changes once in a while:
     find -L _site/ -not -name "*.page" -type f | grep -F --invert-match -e 'static/' -e 'doc/' -e 'image/' -e 'fulltext' -e 'lorem' -e 'metadata/' -e '-768px.' -e '.page.html' | \
         grep -E --invert-match -e '/.*/index' -e '.page$' | \
         sort | xargs urlencode -m | sed -e 's/%20/\n/g' | \
         sed -e 's/_site\/\(.*\)/<url><loc>https:\/\/gwern\.net\/\1<\/loc><changefreq>monthly<\/changefreq><\/url>/'
     echo "</urlset>") >> ./_site/sitemap.xml

    ## generate a syntax-highlighted HTML fragment (not whole standalone page) version of source code files for popup usage:
    ### We skip .json/.jsonl/.csv because they are too large & Pandoc will choke; and we truncate at 1000 lines because such
    ### long source files are not readable as popups and their complexity makes browsers choke while rendering them.
    ### (We include plain text files in this in order to get truncated versions of them.)
    bold "Generating syntax-highlighted versions of source code files…"
    syntaxHighlight () {
        #### NOTE: for each new extension, add a `find` name, and an entry in `extracts-content.js`
        declare -A extensionToLanguage=( ["R"]="R" ["c"]="C" ["py"]="Python" ["css"]="CSS" ["hs"]="Haskell" ["js"]="Javascript" ["patch"]="Diff" ["diff"]="Diff"
                                         ["sh"]="Bash" ["bash"]="Bash" ["html"]="HTML" ["conf"]="Bash" ["php"]="PHP" ["opml"]="Xml" ["xml"]="Xml" ["page"]="Markdown"
                                         # NOTE: we do 'text' to get a 'syntax-highlighted' version which has wrapped columns etc.
                                         ["txt"]="" ["yaml"]="YAML" ["jsonl"]="JSON" ["json"]="JSON" ["csv"]="CSV" )
        for FILE in "$@"; do
            FILEORIGINAL=$(echo "$FILE" | sed -e 's/_site//')
            FILENAME=$(basename -- "$FILE")
            EXTENSION="${FILENAME##*.}"
            LANGUAGE=${extensionToLanguage[$EXTENSION]}
            FILELENGTH=$(cat "$FILE" | wc --lines)
            (echo -e "~~~~~~~~~~~~~~~~~~~~~{.$LANGUAGE}"; # NOTE: excessively long tilde-line is necessary to override/escape any tilde-blocks inside Markdown files:
             if [ $EXTENSION == "page" ]; then # the very long lines look bad in narrow popups, so we fold:
                 cat "$FILE" | fold --spaces --width=70 | sed -e 's/~~~/∼∼∼/g' | head -1100 | iconv -t utf8 -c;
             else
                 cat "$FILE" | head -1000;
             fi
             echo -e "\n~~~~~~~~~~~~~~~~~~~~~"
             if (( $FILELENGTH >= 1000 )); then echo -e "\n\n…[File truncated due to length; see original file]…"; fi; ) | pandoc --mathjax --write=html5 --from=markdown+smart | \
                ## delete annoying self-link links: Pandoc/skylighting doesn't make this configurable
                sed -e 's/