#!/bin/bash
# When: Time-stamp: "2023-03-29 18:05:17 gwern"
# see https://gwern.net/About#markdown-checker
set +x
# function to wrap checks and print highlighted warning if non-zero output (self-documenting):
wrap() { OUTPUT=$($1 2>&1)
WARN="$2"
if [ -n "$OUTPUT" ]; then
echo -e "\e[41m$WARN\e[0m":
echo -e "$OUTPUT";
fi; }
fgp () { grep -F --context=1 --line-number --color=always "$@"; }
egp () { grep -E --ignore-case --context=1 --line-number --color=always "$@"; }
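# Every check below follows the same pattern: define a throwaway function 'λ' that greps "$PAGE"
# (or the compiled HTML), then hand it to 'wrap' with a warning label; the label is printed (highlighted)
# only if the check produced output. A minimal sketch of what an additional check would look like
# (the 'TODO' pattern is purely illustrative, not one of this script's checks):
#
#   λ(){ fgp -e 'TODO' -- "$PAGE"; }
#   wrap λ "Unfinished TODO markers left in the page"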
for PAGE in "$@"
do
if [[ $PAGE == *.page ]]; then
λ(){ file "$PAGE" | fgp --invert-match 'text'; }
wrap λ "Not text, perhaps due to bad copy-paste"
λ(){ egp '[^[:print:]]' "$PAGE"; }
wrap λ "File contains unprintable characters"
λ(){ fgp -e 'http://dl.dropbox' -e '.wiley.com/doi/abs/' \
-e 'www.tandfonline.com/doi/abs/' -e 'jstor.org' -e 'springer.com' -e 'springerlink.com' \
-e 'www.mendeley.com' -e 'academia.edu' -e 'researchgate.net' -e 'pdf.yt' \
-e 'photobucket' -e 'imgur.com' -e 'hathitrust.org' -e 'emilkirkegaard.dk' -e 'arthurjensen.net' \
-e 'humanvarieties.org' -e 'libgen.io/' -e 'gen.lib.rus.ec/' -e 'sci-hub.bz/' -e '](http://www.scilogs.com/' \
-e 'sci-hub.cc/' -e "papers.nber.org/" -e '](!wikipedia' -e '](!wikipedia)'"'s" -e 'https://wwww.' -e 'http://wwww.' \
-e 'http://33bits.org' -e 'https://gwern.net' -e 'web.archive.org/web/2' \
-e 'webarchive.org.uk/wayback/' -e 'webcitation.org' -e 'plus.google.com' -e 'www.deepdotweb.com' -e 'raikoth.net' \
-e 'drive.google.com/file' -e 'ssrn.com' -e 'ardenm.us' -e 'gnxp.nofe.me' -e 'psycnet.apa.org' \
-e 'wellcomelibrary.org/item/' -e 'dlcs.io/pdf/' -e 'secure.wikimedia.org' \
-e 'https://biorxiv.org' \
-e 'fbclid=' -e '?gid=' -e 'twitter.com/#!' -e 'pay.reddit.com' -e 'europepmc.org' -e 'drugcite.com' \
-e 'guardian.co.uk' -e 'mlp.wikia.com' -e '฿' -e '!Wikipedia ""' -e 'medium.com' -e 'temcauley.staff.shef.ac.uk' \
-e 'yahoo.com' -e 'bloomberg.com' -e '.wsj.com' -e 'extremelongevity.net' -e 'blog.openai.com' \
-e 'https://ww.gwern.net' -e 'https://w.gwern.net' -e 'www.heretical.com' -e 'books.google.ca' \
-e 'lesserwrong.com' -e 'au.news.yahoo.com' -e 'northjersey.com' -e 'tribune.com.pk' -e 'idsnews.com' \
-e 'catsensebook.com' -e 'whec.com' -e 'www.mercurynews.com' -e 'meetup.com' -e 'www.reddit.com' \
-e 'dlcs.io/' -e 'centerforcollegeaffordability.org' -e 'quora.com' -e 'times-news.com' -e 'www.cebp.nl' \
-e '#filmtv' -e 'nybooks.com' -e 'annualreviews.org' \
-e 'dspace.mit.edu' -e 'shirky.com' -e '](http://www.nzherald.co.nz)' -e 'https://www.arxiv.org' \
-e 'goodreads.com/review/show' -e 'myanimelist.net/reviews.php?id=' \
-e 'cloudfront.net' -e 'https://www.amazon.com/s?ie=UTF8&field-isbn=&page=1&rh=i:stripbooks' -e 'http://ltimmelduchamp.com' \
-e 'thiswaifudoesnotexist.net)' -e 'thiswaifudoesnotexist.net"' -e 'www.wikilivres.ca' -e 'worldtracker.org' \
-e 'meaningness.wordpress.com' -e 'ibooksonline.com' -e tinypic.com -e isteve.com -e 'j-bradford-delong.net' -- "$PAGE";
egp -e 'https://arxiv.org/abs/[0-9]{4}\.[0-9]+v[0-9]' -- "$PAGE";}
wrap λ "find bad URLS, unacceptable/unreliable/risky domains, malformed syntax, unmatched apostrophes"
λ(){ fgp -e 'http://news.ycombinator.com' -e 'http://github.com' -e 'http://www.ncbi.nlm.nih.gov/pubmed/' \
-e 'http://www.coursera.org' -e 'http://en.wikipedia.org' -e 'http://biorxiv.org' -e 'http://www.biorxiv.org' \
-e 'http://arxiv.org' -e 'http://slatestarcodex.com' -e 'http://www.arxiv.org' \
-e 'http://myanimelist.net' -e 'http://www.bmj.com' -e 'http://www.youtube.com' -e 'http://youtu.be' -e "http://www.nature.com/" \
-e "http://www.sciencedirect.com" -e "http://journals.plos.org" -e "http://www.pnas.org" -e "http://www.wsj.com" \
-e "http://link.springer.com" -e "http://www.bbc.com" -e "http://genomebiology.biomedcentral.com" -e "http://www.npr.org" \
-e "http://www.ipscell.com" -e "http://www.newyorker.com" -e "http://www.nytimes.com" -e 'http://ask.metafilter.com' \
-e 'http://www.metafilter.com' -e 'http://www.vanityfair.com' -e 'http://econlog.econlib.org' -e 'http://www.overcomingbias.com' \
-e 'http://www.economist.com' -e 'http://www.theverge.com' -- "$PAGE"; }
wrap λ "HTTP → HTTPS URLs"
## ban articles written by John Hewitt; he endorses the pig-human hybrid pseudoscience, lies about research (eg. claiming the platypus genome was proven to be a bird hybrid), and makes bad arguments (eg. his criticism of senolytics on the grounds that senescent cells do not have a single unique universal signature):
λ(){ grep -F -e 'phys.org' -- "$PAGE" | fgp -v -e '2019-07-cat-science.html' -e '2017-08-cavemen-genetic-checkup.html' -e'2019-12-mouse-pups-born-eggs-derived.html'; }
wrap λ "Phys.org link detected: make sure John Hewitt didn't write it"
λ(){ ~/wiki/static/build/Columns.hs "$PAGE"; }
wrap λ "Add columns wrapper?"
λ(){ runghc -i/home/gwern/wiki/static/build/ ~/wiki/static/build/link-extractor.hs "$PAGE" | egp --only-matching -e '^http://.*archive\.org/.*\.pdf$'; }
wrap λ "check for aggregator-hosted PDFs and host them on Gwern.net to make them visible to Google Scholar/provide backups"
λ(){ runghc -i/home/gwern/wiki/static/build/ ~/wiki/static/build/link-extractor.hs "$PAGE" | egp --only-matching -e '^http://twitter.com/' -e 'https://.*twitter.com/.+/status/[0-9]+'; }
wrap λ "Switch Twitter.com to Nitter links"
λ(){ egp -e 'http://www.pnas.org/content/.*/.*/.*.abstract' -e '[^\.]t\.test\(' -e '^\~\~\{\.' \
-e 'ncbi.nlm.nih.gov/pubmed/[[:digit:]][[:digit:]]*' \
-e 'biorxiv.org/content/biorxiv/.*\.pdf ' -e '(https://www.biorxiv.org/content/biorxiv/.*\.pdf)' \
-e 'arxiv.org/pdf/.*\.pdf)' -e 'arxiv.org/pdf/.*\.pdf "' -- "$PAGE"; }
wrap λ "if I am not linking a specific page on Arxiv or BioRxiv, why am I linking to the PDF rather than the landing page?"
λ(){ fgp -e ".pdf#subsection" -e ".pdf#Appendix" -- "$PAGE"; }
wrap λ "Section PDF links break when archived locally (thereby breaking the local-PDF popups), so avoid unusual anchors in favor of 'page=N' anchor links"
λ() { egp -e ' "\)[ );,$]' \
-e 'cssExtension: [a-c,e-z]' -e '^R> ' -e '^#+ Comments$' -- "$PAGE";
fgp -e '(www' -e ')www' -e ' )' -e '](//' -e '](/wiki/' -e '](wiki/' -e '——–' -e '——' -e '————–' -e ' --- ' \
-e ' percent ' -e " Pearson'" -e '~~~{.sh}' -e 'library("' -e ' +-' -e ' -+' -e '"collapse Summary"' -e '"CollapseSummary"' -e 'collapseSumary' -e ' [a-Z]' -e '^ > [a-Z]' -e '^ - [a-Z]' \
-e ' : ' -e ']^[' -- "$PAGE"; }
wrap λ "look for broken syntax in original Markdown: (NOTE: footnotes should not be linked to because they are unstable; they should either be sections/appendices, or given a long-term div ID)"
λ() { grep --perl-regexp --null-data --only-matching '(?s)\n\<\/div\>\n\n\^\[\!Margin: .....' -- "$PAGE"; }
wrap λ "Margin note possibly breaks drop caps by being first item on the first line after an abstract"
λ() { grep -P -e '[\x{0590}-\x{05FF}]|[\x{0600}-\x{06FF}]' -- "$PAGE"; }
wrap λ "Check that bidirectional scripts (Hebrew, Arabic) are not displayed; can cause Firefox Mac rendering bugs page-wide"
λ(){ grep -F '~~~{.' -- "$PAGE" | tr -d '{}~' | tr ' ' '\n' | \
grep -F -v -e '.R' -e '.collapse' -e '.Haskell' -e '.Bash' -e '.Diff' -e '.Javascript' -e '.numberLines' \
-e '.Python' -e '.C ' -e '.CPO' -e '.SQL' -e '.Bibtex' -e '.HTML' -e '.CSS'; }
wrap λ "look for potentially broken syntax-highlighting classes"
λ(){ grep -E --invert-match '[[:space:]]*>' -- "$PAGE" | fgp -e ' significant ' -e ' significantly ' -e ' obvious' -e 'basically' -e ' the the ' -e 'reproducibility crisis' -e 'replicability crisis'; } # WARNING: can't use 'egp' for some reason
wrap λ "look for personal uses of illegitimate statistics & weasel words, but filter out blockquotes"
λ(){ fgp -e ' feet' -e ' foot ' -e ' pound ' -e ' mile ' -e ' miles ' -e ' inch' -- "$PAGE";
egp -e '[0-9]+ (January|February|March|April|May|June|July|August|September|October|November|December) [0-9]{4}' -- "$PAGE"; }
wrap λ "look for English/imperial units as a reminder to switch to metric as much as possible"
λ(){ grep -E --only-matching '^\[\^.*\]: ' -- "$PAGE" | sort | uniq --count | \
grep -F --invert-match ' 1 [^'; }
wrap λ "check for duplicate footnote IDs (force no highlighting, because the terminal escape codes trigger bracket-matching)"
λ(){ egp --only-matching '\!\[.*\]\(wiki/.*\)' -- "$PAGE"; }
wrap λ "indicates broken copy-paste of image location"
λ(){ grep --perl-regexp --null --only-matching -e '\!\[.*\]\(.*\)\n\!\[.*\]\(.*\)' -- "$PAGE"; }
wrap λ "look for images used without newline in between them; in some situations, this leads to odd distortions of aspect ratio/zooming or something (first discovered in Correlation.page in blockquotes)"
λ(){ egp '^[^$]* [^\"]\$[^$]*$' -- "$PAGE"; }
wrap λ "look for unescaped single dollar-signs (risk of future breakage)"
λ(){ egp -e '[a-zA-Z]- ' -- "$PAGE"; }
wrap λ "Write out shortcuts"
λ(){ fgp -e '= ~' -- "$PAGE" | fgp --invert-match ' mods'; }
wrap λ "Unicodify: instead of writing 'x = ~y', unicode as '≈'"
λ(){ fgp -e '?!' -e '!?' -e '<->' -e '~>' -- "$PAGE"; }
wrap λ "Unicodify: misc"
λ(){ egp -e '[[:alnum:]]≠[[:alnum:]]' -- "$PAGE"; }
wrap λ "Unicodify: != renders better with spaces around it"
λ(){ egp -e '\$[[:alnum:]]\$' -e '\$\\sqrt{[[:digit:]]}\$' -e '\$[[:alnum:]]\^[[:alnum:]]\$' -e '\$[[:alnum:]]+ \\cdot [[:alnum:]]+\$' \
-e '\$log_2\$' -e '\$log_10\$' -e '\$\\mathcal{N}' -- "$PAGE"; }
wrap λ "LaTeX: simplify to Unicode/Markdown"
λ(){ egp -e '\$\\frac{[0-9]+}{[0-9]+}' -- "$PAGE"; }
wrap λ "Unicodify: LaTeX for simple numerical fractions is overkill; use '⁄' FRACTION SLASH instead"
λ(){ grep -F ' \times ' -- "$PAGE"; }
wrap λ "LaTeX: \\cdot is nicer"
λ(){ grep -F '$$E(' -- "$PAGE"; }
wrap λ "LaTeX: use \\mathbb for expectations"
λ(){ egp -e '[a-zA-Z]→[a-zA-Z]' -e '[a-zA-Z]←[a-zA-Z]' -e '[a-zA-Z]↔[a-zA-Z]' -- "$PAGE"; }
wrap λ "Add spaces to arrows: more legible, fewer odd interactions (like Hyphenator)"
λ(){ fgp -i -e '**Warn' -e '**Note' -e '**Error' -- "$PAGE"; }
wrap λ "Reminder to use formal admonitions instead of just bolding"
λ() { egp -e '^[0-9]\. \*[^\*]' -e '^- \*[^\*][a-Z]' -e '^- \[.*\]\{\.smallcaps\}' -- "$PAGE"; }
wrap λ "Reminder to use bold as the top level of emphasis in lists rather than italics or smallcaps"
[ "$(grep -E '^title: ' "$PAGE" | wc --char)" -le 10 ] && echo -e '\e[41mWARNING\e[0m: "title:" metadata too short.'
[ "$(grep -E '^title: ' "$PAGE" | wc --char)" -ge 60 ] && echo -e '\e[41mWARNING\e[0m: "title:" metadata too long.'
[ "$(grep -E '^description: ' "$PAGE" | wc --char)" -le 90 ] && echo -e '\e[41mWARNING\e[0m: "description:" metadata too short.'
[ "$(grep -E '^description: ' "$PAGE" | wc --char)" -ge 320 ] && echo -e '\e[41mWARNING\e[0m: "description:" metadata too long.'
[ "$(grep -E '^next: ' "$PAGE" | wc --char)" -eq 0 ] && echo -e '\e[41mWARNING\e[0m: "next:" metadata is missing.'
[ "$(grep -E '^previous: ' "$PAGE" | wc --char)" -eq 0 ] && echo -e '\e[41mWARNING\e[0m: "previous:" metadata is missing.'
[ "$(grep -E '^thumbnail: ' "$PAGE" | wc --char)" -le 20 ] && echo -e '\e[41mWARNING\e[0m: No thumbnail/illustration defined.'
# skip on newsletters since their URLs are always being modified:
[[ ! $PAGE =~ "newsletter/" ]] && [ "$(grep -E '^modified: 20' "$PAGE" | wc --char)" -eq 0 ] && echo -e '\e[41mWARNING\e[0m: "modified:" metadata is missing.'
λ() { markdown-length-checker.hs "$PAGE";}
wrap λ "Source code line lengths"
λ() { markdown-footnote-length.hs "$PAGE"; }
wrap λ "Footnote lengths"
# proselint configuration is done in ~/.proselintrc; for Markdown, currently disabled: "archaism.misc" "cursing.nfl" "links.broken" "misc.but" "misc.chat speak" "misc.phrasal_adjectives" "sexism.misc" "typography.diacritical_marks" "typography.exclamation" "typography.symbols" "typography.symbols.ellipsis" "typography.symbols.curly_quotes"
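# a ~/.proselintrc entry disabling a check looks roughly like the following (JSON; see the proselint docs for the exact schema):
#   { "checks": { "typography.symbols.ellipsis": false } }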
λ() { proselint "$PAGE"; }
wrap λ "Proselint nitpicks"
## reused later as well:
HTML=$(mktemp --suffix=".html")
cat "$PAGE" | pandoc --metadata lang=en --metadata title="Test" --mathml --to=html5 --standalone --number-sections --toc --reference-links --css=https://gwern.net/static/css/default.css -f markdown+smart --template=/home/gwern/bin/bin/pandoc-template-html5-articleedit.html5 - --output="$HTML"
λ() { tidy -quiet -errors --doctype html5 "$HTML" 2>&1 >/dev/null | grep -F -v -e 'Warning: <img> proprietary attribute "loading"'; }
wrap λ "HTML validation problems"
λ() { cat "$HTML" | elinks -dump --force-html | grep -E 'file:///dev/stdin#.*\..*'; }
wrap λ "Header problem: period present (valid HTML but breaks CSS/JS!). Override default Pandoc link fragment with '{#header-without-period}'"
λ() { COLLAPSED=$(cat "$HTML" | grep -E --after-context=3 '
' | wc --lines)
MISSING=$(( COLLAPSED_SECTION_COUNT - COLLAPSED_SUMMARY_COUNT ))
if [[ $MISSING != 0 ]];
then echo "Missing collapsed section summaries?"
echo "Sections: $COLLAPSED_SECTION_COUNT ; summaries: $COLLAPSED_SUMMARY_COUNT"
echo "Hits: $COLLAPSED"
fi; }
wrap λ "Missing collapse summaries"
λ() { fgp -e "<""del"">" "$HTML";
elinks -dump --force-html "$HTML" \
| fgp -e '\frac' -e '\times' -e '(http' -e ')http' -e '[http' -e ']http' \
-e ' _ ' -e '[^' -e '^]' -e '/* ' -e ' */' -e '' -e '<-- ' -e '<—' -e '—>' \
-e '$title$' -e '' -e '.smallcaps' -e '' -e '' \
-e '$description$' -e '$author$' -e '$tags$' -e '$category$' \
-e '(!Wikipedia' -e '(!Hoogle' -e 'http://www.gwern.net' -e 'http://gwern.net' -e 'https://www.gwern.net' -e 'smallcaps}' \
-e '!Marin' -e '**' \
| egp -e '\!Margin:.*↩'; } # ))
wrap λ "look for syntax errors making it to the final HTML output"
λ(){ runghc -i/home/gwern/wiki/static/build/ ~/wiki/static/build/link-extractor.hs "$PAGE" | grep -E -v -e "^http" -e '^!Wikipedia' -e '^#' -e '^/' -e '^\!' -e '^\$'; }
wrap λ "special syntax shouldn't make it to the compiled HTML"
λ() { runghc -i/home/gwern/wiki/static/build/ ~/wiki/static/build/link-extractor.hs "$PAGE" | grep -E -v -e '^\!' -e '^\$' | sort | uniq --count | sort --numeric-sort | grep -E -v -e '.* 1 '; }
wrap λ "Duplicate links"
λ(){ egp --only-matching '\!\[.*\]\(http://.*\)' -- "$PAGE"; }
wrap λ "image hotlinking deprecated; impolite, and slows page loads & site compiles"
# Note links which need to be annotated (probably most of them...)
λ() { runghc -i/home/gwern/wiki/static/build/ ~/wiki/static/build/link-extractor.hs "$PAGE" | grep -E -v -e '^\!' -e '^\$' -e '^/doc/.*txt' -e '.xz$' -e '^#' -e '.patch$' -e '.jpg$' -e '.png$' -e '.mp4' -e '.mp3' -e 'news.ycombinator.com' -e 'old.reddit.com' -e 'youtube.com' -e 'youtu.be/' -e 'amazon.com' -e 'bandcamp.com' -e 'dropbox.com' -e 'vocadb.net' -e 'twitter.com' -e 'nitter.moomoo.me' -e '#link-bibliography' -e 'https://en.wikipedia.org/wiki' | runghc -istatic/build/ static/build/link-prioritize.hs; }
wrap λ "Link annotations required"
# we use link annotations on URLs to warn readers about PDFs; if a URL ends in 'pdf', it gets a PDF icon. What about URLs which redirect to or serve PDF?
# we must manually annotate them with a '#pdf'. Check URLs in a page for URLs which serve a PDF MIME type but do not mention PDF in their URL:
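# eg. a link like '[paper](https://example.com/download?id=42#pdf)' (URL hypothetical) gets the PDF treatment despite not ending in '.pdf'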
λ() { checkPDF() {
MIME=$(timeout 20s curl --insecure --write-out '%{content_type}' --silent -o /dev/null "$@" | grep -F -i -e "application/pdf" -e "application/octet-stream")
if [ ${#MIME} -gt 5 ]; then
if [[ ! $@ =~ .*pdf.* ]] && [[ ! $@ =~ .*PDF.* ]]; then
echo "UNWARNED NON-LOCAL PDF: " "$@" "$MIME"
fi
echo "NON-LOCAL PDF: $@"
fi; }
export -f checkPDF
# examples: no/no/yes
## checkPDF 'http://www.nytimes.com/2009/11/15/business/economy/15view.html ' # no
## checkPDF 'http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.208.2314&rep=rep1&type=pdf' # yes
## checkPDF 'https://files.osf.io/v1/resources/np2jd/providers/osfstorage/59614dec594d9002288271b6?action=download&version=1&direct' # yes
runghc -i/home/gwern/wiki/static/build/ ~/wiki/static/build/link-extractor.hs "$PAGE" | grep -E "^http" | grep -F -v -e 'https://gwern.net' -e arxiv.org -e pnas.org | sort -u | shuf | parallel -n 1 checkPDF; }
wrap λ "Non-icon/warned PDF links"
λ() { for PDF in $(runghc -i/home/gwern/wiki/static/build/ ~/wiki/static/build/link-extractor.hs "$PAGE" | grep -E -e '^/doc/' -e 'https:\/\/www\.gwern\.net\/' | \
grep -E '\.pdf$' | sed -e 's/\/doc/doc/' -e 's/https:\/\/www\.gwern\.net//' ); do
TITLE=$(exiftool -printFormat '$Title' -Title ~/wiki/"$PDF")
AUTHOR=$(exiftool -printFormat '$Author' -Author ~/wiki/"$PDF")
DATE=$(exiftool -printFormat '$Date' -Date ~/wiki/"$PDF")
DOI=$(exiftool -printFormat '$DOI' -DOI ~/wiki/"$PDF")
TEXT=$(pdftotext ~/wiki/"$PDF" - 2>/dev/null)
TEXT_LENGTH=$(echo "$TEXT" | wc --chars)
PREPRINT=$(echo "$TEXT" | grep -F -e 'This is a PDF file of an unedited manuscript that has been accepted for publication.' \
-e 'use of the JSTOR database indicates your acceptance' \
-e 'you may not download an entire issue of a journal' \
-e 'This article appeared in a journal published by Elsevier.' \
-e 'institutional repository. Authors requiring further information' \
-e 'This article has been accepted for publication and undergone full peer review' \
-e 'the copyediting, typesetting, pagination and proofreading process' \
-e 'Changes resulting from the publishing process, including peer review,' \
-e 'may lead to differences between this version and the Version of Record.' )
if [[ -n "$PREPRINT" ]]; then
echo "$HOME/wiki/$PDF: preprint or front-page garbage ($PREPRINT)"
fi
if (( $TEXT_LENGTH < 1024 )); then
echo "$HOME/wiki/$PDF OCR text length: $TEXT_LENGTH"
fi
if [[ -z $TITLE || -z $AUTHOR || -z $DATE || -z $DOI ]]; then
exiftool -Title -Author -Date -DOI ~/wiki/"$PDF"
fi
done; }
wrap λ "Linked PDFs have missing OCR or missing metadata fields"
# Finally, check for broken external links; ignore local URLs which are usually either relative links to
# other pages in the repo or may be interwiki links like '!Wikipedia'.
λ() { linkchecker --no-status --check-extern --threads=1 --timeout=20 -r1 --ignore='file://.*' "$HTML"; }
wrap λ "Broken links"
fi
if [[ $PAGE == *.sh ]]; then
shellcheck "$PAGE"
fi
if [[ $PAGE == *.hs ]]; then
hlint "$PAGE"
fi
if [[ $PAGE == *.html ]]; then
tidy -quiet -errors --doctype html5 "$PAGE"
fi
done