#!/bin/bash

# Author: Gwern Branwen
# Date: 2016-10-01
# When: Time-stamp: "2024-10-30 19:43:22 gwern"
# License: CC-0
#
# Bash helper functions for Gwern.net wiki use.
# Intended to be sourced in `~/.bashrc` like `. ~/wiki/static/build/bash.sh`
#
# Helper functions include: website cache invalidation; PDF metadata query & editing; tab-completion for tag/directory names; querying the newsletters, filenames, annotation database, website, and IRC (in increasing generality); site-wide string search-and-replace (including a HTTP→HTTPS special-case which comes up a lot when fixing redirects); functions to rename files & directories; and a `gwtag` CLI command which creates new annotations (if there is a usable annotation source, such as being an Arxiv link) or adds/removes tags from specified files/URLs.
#
# See also: /static/build/{upload, gwa, crossref, compressJpg2}

source /usr/share/bash-completion/bash_completion || true # useful for better `upload` tab-completion

# Bash completion function for ghc/ghci Haskell source files
#
# Features:
# - Completes Haskell source files (.hs) and directories
# - Excludes object (.o) and interface (.hi) files
# - Case-insensitive matching
# - Recursive directory search (depth limit: 5)
# - Adds trailing slashes to directories without appending spaces
#
# Usage: Source this file or add it to your bash_completion.d directory
# Note: we wrote this one because the official GHC bash tab-completion script
# has not been modified in a decade, doesn't seem to be shipped at all,
# and I don't really use the limited completions that old one offers.
_ghc_ghci_completion () {
    local cur lower_cur
    _init_completion || return
    local IFS=$'\n'
    local valid_completions=()

    shopt -s nocaseglob nullglob

    add_completions () {
        local dir="$1" prefix="$2" depth="$3"
        [[ $depth -gt 5 ]] && return

        for file in "$dir"*; do
            local rel_file="${file#$dir}"
            if [[ -d "$file" ]]; then
                valid_completions+=("$prefix$rel_file/")
                add_completions "$file/" "$prefix$rel_file/" $((depth + 1))
            elif [[ "$file" == *.hs && ! "$file" =~ \.o$|\.hi$ ]]; then
                valid_completions+=("$prefix$rel_file")
            fi
        done
    }

    add_completions "./" "" 0

    shopt -u nocaseglob nullglob

    lower_cur=${cur,,}
    mapfile -t COMPREPLY < <(for c in "${valid_completions[@]}"; do
                                 [[ "${c,,}" == "$lower_cur"* ]] && printf '%s\n' "$c"
                             done)

    [[ ${#COMPREPLY[@]} -eq 1 && ${COMPREPLY[0]} == */ ]] && compopt -o nospace
}
complete -o nospace -F _ghc_ghci_completion ghc ghci

# Default Haskell parallelism:
export N="7"

set -e

bold () { echo -e "\033[1m$@\033[0m"; }
red () { echo -e "\e[41m$@\e[0m"; }
## function to wrap checks and print red-highlighted warning if non-zero output (self-documenting):
wrap () {
    OUTPUT=$($1 2>&1)
    WARN="$2"
    if [ -n "$OUTPUT" ]; then
        echo -n "Begin: "; red "$WARN";
        echo -e "$OUTPUT";
        echo -n "End: "; red "$WARN";
    fi;
}
ge () { grep --extended-regexp "$@"; }
gec () { ge --color=always "$@"; }
gev () { ge --invert-match "$@"; }
gf () { grep --fixed-strings "$@"; }
gfc () { gf --color=always "$@"; }
gfv () { gf --invert-match "$@"; }
export -f bold red wrap ge gec gev gf gfc gfv

file2Path () { echo "$1" | sed -e 's/~\/wiki//' -e 's/\/home\/gwern\/wiki\//\//' -e 's/\.\//\//' -e 's/https:\/\/gwern\.net//g' -e 's/^/\//' -e 's/^\/\//\//'; }

path2File () {
    # This function takes any number of arguments as input paths or URLs
    # and converts them to a file path under the ~/wiki/ directory.
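    # eg. (illustrative filenames) 'doc/ai/2024-foo.pdf' and '/doc/ai/2024-foo.pdf' would both become
    # '/home/gwern/wiki/doc/ai/2024-foo.pdf'; arguments which already name an existing local file are passed through unchanged.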
    # Loop through all arguments passed to the function
    for arg in "$@"
    do
        if [ -f "$arg" ]; then echo "$arg";
        else
            # Convert the input to the desired file path using sed
            # The following transformations are applied:
            # 1. Strip 'wiki/' prefix if present
            # 2. Replace '~/' with '/home/gwern/wiki/'
            # 3. Prepend '/home/gwern/wiki/' if the path starts with an alphanumeric character
            # 4. Remove 'https://gwern.net' if present to support URLs from gwern.net
            # 5. Special handling for paths starting with '/doc' to ensure they go into the 'doc' subdirectory correctly
            ARG=$(echo "$arg" | sed -e 's/^wiki\///' \
                                    -e 's/\~\//\/home\/gwern\/wiki\//' \
                                    -e 's/^\([a-zA-Z0-9].*\)/\/home\/gwern\/wiki\/\1/' \
                                    -e 's/https:\/\/gwern\.net//g' \
                                    -e 's/^\/doc/\/home\/gwern\/wiki\/doc/')
            echo "$ARG"
        fi
    done
}
export -f path2File

cloudflare-expire () {
    ARGS=$(path2File "$@")
    for FILE in $(realpath $ARGS); do
        URL="$(echo $FILE | sed -e 's/\.md//' -e 's/\/home\/gwern\/wiki\/\(.*\)/https:\/\/gwern\.net\/\1/g' -e 's/\.md//g' | sort )"
        echo -n "Expiring: $FILE → $URL : "
        curl --silent --request POST "https://api.cloudflare.com/client/v4/zones/$CLOUDFLARE_TARGET/purge_cache" \
             --header "X-Auth-Email:gwern@gwern.net" \
             --header "Authorization: Bearer $CLOUDFLARE_CACHE_TOKEN" \
             --header "Content-Type: application/json" \
             --data "{\"files\":[\"$URL\"]}" | jq '.success'
        (curl --silent "$URL" &) > /dev/null
    done
}
export -f cloudflare-expire
cloudflare-expire-all () {
    (find ~/wiki/ -name "*.md" -type f -not -wholename "*/\.*/*" -not -wholename "*/_*/*" | sort;
     find ~/wiki/static/ -type f | sort) | \
        sed -e 's/\.md//' -e 's/^\.\/\(.*\)$/https:\/\/gwern\.net\/\1/' | sort | parallel cloudflare-expire;
}

pdf () {
    for ARG in "$@"; do
        TARGET=$(echo "$ARG" | sed -e 's/^\/doc\//\/home\/gwern\/wiki\/doc\//')
        (evince "$TARGET" &);
    done;
}

# delete the first page of the PDF. This is useful to remove the spam in PDFs from JSTOR and many other academic publishers. (Some of them do 2 or even 3 spam pages, but you can just run `pdfcut` repeatedly quickly with up-arrow in bash, of course.)
pdfcut () {
    for PDF in "$@"; do
        ORIGINAL=$(path2File "$PDF")
        TARGET=$(mktemp /tmp/XXXXXX.pdf);
        pdftk "$ORIGINAL" cat 2-end output "$TARGET" &&
            # pdftk by default erases all metadata, so we need to copy it all to the new PDF:
            exiftool -TagsFromFile "$ORIGINAL" "$TARGET" &&
            mv "$TARGET" "$ORIGINAL" || rm "$TARGET";
        (crossref "$ORIGINAL" &);
    done
}

# delete the last page of the PDF, similar to `pdfcut`:
pdfcut-last () {
    if [ $# -ne 1 ]; then
        red "Usage: pdfcut-last <file.pdf>" >&2
        return 1
    fi
    ORIGINAL=$(path2File "$1")
    TARGET=$(mktemp /tmp/XXXXXX.pdf)

    # Get the total number of pages
    PAGES=$(pdftk "$ORIGINAL" dump_data | grep NumberOfPages | awk '{print $2}')

    # If the PDF has only one page, we can't remove the last page
    if [ "$PAGES" -eq 1 ]; then
        red "Error: The PDF has only one page. Cannot remove the last page." >&2
        return 1
    fi

    # Remove the last page
    pdftk "$ORIGINAL" cat 1-r2 output "$TARGET" &&
        # Copy metadata to the new PDF
        exiftool -TagsFromFile "$ORIGINAL" "$TARGET" &&
        mv "$TARGET" "$ORIGINAL" || rm "$TARGET"

    (crossref "$ORIGINAL" &)
}
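
# eg. (illustrative filename) `pdfcut 2020-foo.pdf` deletes page 1, and `pdfcut-last 2020-foo.pdf` deletes the final page; both preserve the original metadata by copying it back with exiftool.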

# sometimes we want to keep the first/cover page, but still don't want to actually make it the *first* page (or work around this with the `#page=2` trick); so we can just rotate it to the end rather than deleting it entirely.
pdfcut-append () {
    if [ $# -ne 1 ]; then
        red "Wrong number of arguments; 'pdfcut-append' moves the first page to the end. To delete the first page, use 'pdfcut'." >&2 && return 1;
    fi
    ORIGINAL=$(path2File "$@")
    TARGET=$(mktemp /tmp/XXXXXX.pdf);
    pdftk "$ORIGINAL" cat 2-end 1 output "$TARGET" &&
        # pdftk by default erases all metadata, so we need to copy it all to the new PDF:
        exiftool -TagsFromFile "$ORIGINAL" "$TARGET" &&
        mv "$TARGET" "$ORIGINAL" || rm "$TARGET";
    (crossref "$ORIGINAL" &);
}

# concatenate a set of PDFs, and preserve the metadata of the first PDF; this is useful for combining a paper with its supplement or other related documents, while not erasing the metadata the way naive `pdftk` concatenation would.
# This accepts Docx files as well due to their frequency in supplemental files, so `pdf-append foo.pdf supplement-1.doc supplement-2.pdf` is allowed (Docx is converted to PDF by `doc2pdf`).
# (The appended PDFs are soft-deleted by default, by moving them to the temporary directory used, which is not removed afterwards. In case of a rare problem, they can be retrieved from there.)
pdf-append () {
    if [ $# -lt 2 ]; then red "Not enough arguments" >&2 && return 1; fi

    ORIGINAL=$(path2File "$1")
    TARGET=$(mktemp /tmp/XXXXXX.pdf)
    TEMP_DIR=$(mktemp -d)

    # Convert non-PDF files to PDF using doc2pdf
    PDF_FILES=()
    # Store the files that will need to be moved to temp dir (all except first)
    FILES_TO_MOVE=("${@:2}")

    for file in "$@"; do
        if [[ "$file" =~ \.(pdf|PDF)$ ]]; then
            PDF_FILES+=("$file")
        elif [[ "$file" =~ \.(doc|docx|odt)$ ]]; then
            output_pdf="$TEMP_DIR/$(basename "${file%.*}").pdf"
            doc2pdf "$file" "$output_pdf"
            PDF_FILES+=("$output_pdf")
        else
            red "Skipping unsupported file: $file" >&2
        fi
    done

    # Concatenate PDFs
    if pdftk "${PDF_FILES[@]}" cat output "$TARGET"; then
        # If concatenation successful, preserve metadata from the first PDF
        if exiftool -TagsFromFile "$ORIGINAL" "$TARGET" && mv "$TARGET" "$ORIGINAL"; then
            # Move the original files (excluding the first one) to temp directory
            for file in "${FILES_TO_MOVE[@]}"; do
                if [ -f "$file" ]; then
                    mv "$file" "$TEMP_DIR/"
                    bold "Moved $file to $TEMP_DIR" >&2
                fi
            done
            # List contents of temp directory
            bold "Contents of temporary directory:" >&2
            ls -l "$TEMP_DIR" >&2
            return 0
        else
            red "Error preserving metadata or moving target file" >&2
            rm -f "$TARGET"
            return 1
        fi
    else
        red "PDF concatenation failed" >&2
        rm -f "$TARGET"
        return 1
    fi
}

doc2pdf () {
    [ $# -eq 0 ] && { red "Usage: doc2pdf <input_file> [output_file]"; return 1; }
    input="$1"
    output="${2:-${input%.*}.pdf}"
    [[ ! -s "$input" || ! "$input" =~ \.(doc|docx|odt)$ ]] && { red "Error: Invalid or empty file: $input"; return 1; }
    soffice --convert-to pdf "$input" --headless --outdir "$(dirname "$output")"
    [ "$#" -eq 2 ] && mv "${input%.*}.pdf" "$output"
}

# trim whitespace from around JPG/PNG images
crop_one () {
    if [[ "$*" =~ .*\.(jpg|png) ]]; then
        nice convert "$(path2File "$@")" -crop "$(nice -n 19 ionice -c 3 convert "$@" -virtual-pixel edge -blur 0x5 -fuzz 1% -trim -format '%wx%h%O' info:)" +repage "$@";
    fi
}
crop () { export -f crop_one; ls "$(path2File "$@")" | parallel crop_one; } # WARNING: if 'export' isn't inside the function call, it breaks 'atd'! no idea why. may be connected to Shellshock.
export -f crop crop_one
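
# eg. (illustrative filename) `crop /doc/ai/2024-foo.png` auto-trims the whitespace border in-place; if the result then looks cropped too tightly, `pad` (or `crop-pad` in one step) adds a 30px white border back.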

alias invert="mogrify -negate"
# report to InvertOrNot.com one image URL that incorrectly inverts/invert-nots for dark-mode:
# `$ invert-error-report https://gwern.net/doc/math/2024-zhang-figure1-overfittingofmodelfamiliestogsm8k.png`
invert-error-report () {
    curl --request 'POST' 'https://invertornot.com/api/correction' \
         --header 'accept: application/json' --header 'Content-Type: application/json' \
         --data '["'"$1"'"]';
}

# add white pixels to an image which has been cropped too tightly to look good:
pad () {
    for FILE in "$@"; do
        mogrify -bordercolor white -border 30 "$(path2File "$FILE")"
    done
}
pad-black () {
    for FILE in "$@"; do
        mogrify -bordercolor black -border 30 "$(path2File "$FILE")"
    done
}
crop-pad () { crop "$@" && pad "$@"; }
crop-pad-black () { crop "$@" && pad-black "$@"; }

# split an image into top & bottom halves:
function split_image () {
    local image_path="$1";
    local base_name=$(basename "$image_path" .png);
    local height=$(identify -format "%h" "$image_path");
    local half_height=$((height / 2))
    convert "$image_path" -crop 100%x50%+0+0 "${base_name}-1.png";
    convert "$image_path" -crop 100%x50%+0+$half_height "${base_name}-2.png";
}

# convert black background to white: `mogrify -fuzz 5% -fill white -draw "color 0,0 floodfill"`

# check a PNG to see if it can be turned into a JPG with minimal quality loss (according to the ImageMagick PSNR perceptual loss); for PNGs that should be JPGs, often the JPG will be a third the size or less, which (particularly for large images like sample-grids) makes for more pleasant web browsing.
# (This does not rename or convert as it is still experimental and automatically renaming gwern.net files is dangerous due to the need for global search-and-replace & defining nginx redirects.)
png2JPGQualityCheck () {
    for ARGS in "$@"; do
        ARG=$(path2File "$ARGS")

        QUALITY_THRESHOLD=31 # decibels
        SIZE_REDUCTION_THRESHOLD=30 # %
        TMP_DIR="${TMPDIR:-/tmp}" # Use TMPDIR if set, otherwise default to /tmp
        # JPG_BASENAME="$(basename "${ARG%.png}.jpg")"
        JPG="$(mktemp "$TMP_DIR/XXXXXX.jpg")"

        # Convert PNG to JPG at 15% quality (to ensure any artifacts show up clearly):
        convert "$ARG" -quality 15% "$JPG"

        # Calculate file sizes
        PNG_SIZE=$(stat -c%s "$ARG")
        JPG_SIZE=$(stat -c%s "$JPG")

        # Calculate size reduction in percentage
        SIZE_REDUCTION=$(echo "scale=2; (1 - $JPG_SIZE / $PNG_SIZE) * 100" | bc)

        # Calculate PSNR
        PSNR=$(compare -metric PSNR "$ARG" "$JPG" null: 2>&1 | cut --delimiter=' ' --fields=1)

        # Check both PSNR quality and size reduction
        if (( $(echo "$PSNR > $QUALITY_THRESHOLD" | bc -l) )) && (( $(echo "$SIZE_REDUCTION >= $SIZE_REDUCTION_THRESHOLD" | bc -l) )); then
            echo "$ARG"
        fi

        # Clean up: Remove the temporary JPG file
        rm "$JPG"
    done
}
export -f png2JPGQualityCheck

# crossref: defined in ~/wiki/static/build/crossref
cr () { crossref "$@" & }

# PDF cleanup: strip encryption, run through `pdftk` to render them standard & strip out weirdness, reformat titles.
e () {
    FILE=""
    # did I write '$ e -Title="Foobar" 2000-baz.pdf' or '$ e 2000-baz.pdf -Title="Foobar"'?
    if [[ "${1:0:1}" == "-" ]]; then
        FILE="${*: -1}"
    else
        FILE=$(path2File "$1")
        shift;
    fi
    if [[ -a "$FILE" ]]; then
        if [[ "$FILE" =~ .*\.pdf || "$FILE" =~ .*\.jpg || "$FILE" =~ .*\.png ]]; then
            if [[ "$FILE" =~ .*\.pdf ]]; then
                ENCRYPTION=$(exiftool -q -q -Encryption "$FILE")
                if [ "$ENCRYPTION" != "" ]; then
                    pdftk "$FILE" input_pw output foo.pdf && mv foo.pdf "$FILE";
                fi;
                # Sometimes PDFs are really weird and exiftool will claim to successfully write metadata to them
                # and yet nothing will happen.
                # I've found that if you pass them through pdftk first, they seem almost identical,
                # but the mysterious unwritable behavior stops. So we convert any PDF by default.
                TMP="$(mktemp /tmp/XXXXX.pdf)"
                # pdftk by default erases all metadata, so we need to copy it all to the new PDF:
                pdftk "$FILE" cat output "$TMP" && \
                    exiftool -TagsFromFile "$FILE" "$TMP" && \
                    mv "$TMP" "$FILE"
            fi;

            echo "$FILE"
            exiftool -m -overwrite_original "$FILE" "$@";

            # getting very tired of this hyphen junk in my titles...
            TITLE1="$(exiftool -printFormat '$Title' -Title "$FILE")"
            TITLE2="$(echo "$TITLE1" | sed -e 's/‐/-/g' -e 's/^ \+//g' -e 's/ \+$//g' -e 's/\.$//' | tr '_' ':')" # WARNING: tr mangles Unicode, but sed doesn't
            if [[ "$TITLE1" != "$TITLE2" ]]; then exiftool -overwrite_original -Title="$TITLE2" "$FILE"; fi
        else emacsclient "$FILE";
        fi;
    else red "File does not exist? $FILE"
    fi;
}
alias ea="exiftool -All"
alias exiftool="exiftool -overwrite_original"

# Gwern.net searches:
## fixed-grep gwern.net specifically:
gw () {
    if [ $# == 0 ]; then red "Missing search query." >&2 && return 2; fi

    QUERY=$(echo "$*" | tr -d '\n') # remove newlines, which are usually spurious (and also not supported by grep by default?) thanks to browsers injecting newlines everywhere when copy-pasting...

    RESULTS=$( (find ~/wiki/ -type f -name "*.md";
                ls ~/.emacs ~/*.md;
                find ~/wiki/metadata/ ~/wiki/haskell/ -name "*.hs" -or -name "*.gtx";
                find ~/wiki/static/ -type f -name "*.js" -or -name "*.css" -or -name "*.hs" -or -name "*.conf" -or -name "*.gtx" -or -name "*.py" -or -name "*.sh";
                find ~/wiki/ -type f -name "*.html" -not -wholename "*/doc/*" ) | \
                   grep --fixed-strings -v -e '.#' -e 'auto.hs' -e doc/link-bibliography/ -e metadata/annotation/ -e _site/ -e _cache/ | sort --unique | \
                   xargs grep --fixed-strings --color=always --ignore-case --with-filename -- "$QUERY" | cut -c 1-2548);
    if [ -z "$RESULTS" ]; then
        gwl "$QUERY" # fall back to double-checking IRC logs
    else
        echo "$RESULTS"
    fi
}

## file names only
gwf () { (cd ~/wiki/ && find . -type f | grep --fixed-strings -v -e '.#' -e '_cache/' -e '_site/' -e '.git/' | grep --ignore-case "$@" | sed -e 's/^\.\//\//g') | sort; } # path2File?

## Newsletter only:
gwn () { if [ $# != 1 ]; then QUERY="$*"; else QUERY="$@"; fi
         find ~/wiki/newsletter/ -type f -name "*.md" | \
             grep --fixed-strings -v -e '.#' | sort --unique | \
             xargs grep --extended-regexp --ignore-case --color=always --with-filename "$QUERY" | cut --characters 1-2048;
       }

## Annotations:
# gwa: defined in ~/wiki/static/build/gwa
gwal () { gwa "$@" | less -p "$@"; }

## #lesswrong IRC logs
gwl () { if [ $# != 1 ]; then QUERY="$*"; else QUERY="$@"; fi
         grep --extended-regexp --context=1 --text --ignore-case -- "$QUERY" ~/doc/irclogs/*/\#lesswrong.log;
       }

# Gwern.net modifications:
## gwsed shortcut for the common use case of updating a domain HTTP → HTTPS; typically the URLs are otherwise unchanged, and don't need to be individually updated.
gwhttp () {
    if [ $# != 1 ]; then
        echo "HTTP migration only works for 1 domain, did you try a regular search-and-replace?" >&2
        return 2
    else
        # extract top-level domain to do the search-and-replace on that:
        HTTP="http://$(echo "$1" | sed -e 's/[^/]*\/\/\([^@]*@\)\?\([^:/]*\).*/\2/')"
        HTTPS=${HTTP//http:/https:}
        gwsed "$HTTP" "$HTTPS"
    fi
}
export -f gwhttp

## Move a file and sync it and update all callers:
## TODO: handle pages/essays, not just files? see for a sketch.
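## eg. (illustrative filenames) `gwmv /doc/ai/2024-foo.pdf /doc/ai/scaling/2024-foo.pdf` git-moves the file, rsyncs the new path to the server, rewrites all references via `gwsed`, and appends a redirect rule to the nginx config.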
gwmv () {
    # shortcut: if there's only 1 PNG argument & it's a known wiki file (ie. version-controlled in git), then that means we are converting it to JPG using `png2JPGQualityCheck`, and we can skip manually setting the second argument to the `.jpg` extension.
    # (The converse currently never happens & so is unsupported.)
    if [[ $# -eq 1 && $1 =~ \.png$ && $(cd ~/wiki/ && git ls-files --error-unmatch ./"$1" 2>/dev/null) ]]; then
        gwmv "$1" "${1%.png}.jpg"
    else
        if [ $# != 2 ]; then
            red "Need two arguments: OLD file and NEW file! Only got: $@" >&2
            return 2
        else
            set -x
            if [[ ! $(pwd) =~ "/home/gwern/wiki/".* ]]; then cd ~/wiki/ ; fi
            OLD=$(echo "$1" | tr -d '  ⁠' | sed -e 's/https:\/\/gwern\.net//g' -e 's/^\///g' | xargs realpath | sed -e 's/\/home\/gwern\/wiki\//\//g' )
            NEW=$(echo "$2" | tr -d ' ⁠ ' | sed -e 's/https:\/\/gwern\.net//g' -e 's/^\///g' | xargs realpath | sed -e 's/\/home\/gwern\/wiki\//\//g')

            if [[ "$NEW" == "" ]]; then red "Processing arguments failed, exiting immediately!" >&2 ; echo "$OLD" "$NEW" >&2 ; return 8; fi

            # Check if the parent directory of the NEW path exists
            NEW_DIR=$(dirname "$HOME/wiki$NEW")
            if [ ! -d "$NEW_DIR" ]; then
                red "Target directory $NEW_DIR does not exist. Operation aborted." >&2
                return 7
            fi

            if [ -d "$HOME/wiki$OLD" ] || [ -d "${OLD:1}" ]; then
                red "The first argument ($1 $OLD) is a directory. Please use 'gwmvdir' to rename entire directories." >&2
                return 3
            elif [ ! -f "$HOME/wiki$OLD" ] && [ ! -f "${OLD:1}" ]; then
                red "File $OLD not found in current directory or ~/wiki" >&2
                return 4
            fi

            # strip any slash prefix like '/doc/psychology/2021-foo.pdf' → 'doc/psychology/2021-foo.pdf' to turn it into a local relative link
            # eg. "gwmv 'doc/psychology/2021-foo.pdf' 'doc/psychology/writing'" as a shortcut for "gwmv 'doc/psychology/2021-foo.pdf' 'doc/psychology/writing/2021-foo.pdf'"
            if [ -d ~/wiki$NEW ]; then NEW=$(echo $NEW/$(basename $OLD) | sed -e 's/\/\//\//'); fi

            cd ~/wiki/
            if [[ -a ~/wiki$NEW ]]; then
                red "Moved-to target file $NEW exists! Will not move $OLD and overwrite it. If you deliberately want to overwrite $NEW, then explicitly delete it first." >&2
                return 5
            fi

            # Check if OLD is a PNG and NEW is a JPG for conversion
            if [[ "$OLD" =~ \.png$ && "$NEW" =~ \.jpg$ ]]; then
                # preserve the git history by stashing the converted JPG, doing a `git mv` to tell git about the file renaming, and then overwriting the 'JPG' (actually the original PNG) with an actual JPG
                TMP="$(mktemp /tmp/XXXXX.jpg)"
                convert "$HOME/wiki$OLD" "$TMP"
                compressJPG2 "$TMP"
                git mv "$HOME/wiki$OLD" "$HOME/wiki${NEW%.jpg}.jpg"
                mv "$TMP" "$HOME/wiki${NEW%.jpg}.jpg"
            elif [[ -a ~/wiki$OLD ]]; then
                touch ~/wiki"$NEW" && rm ~/wiki"$NEW" && git mv ~/wiki"$OLD" ~/wiki"$NEW" || return 3
            else
                red "File does not exist? $OLD (to be moved to $NEW)" >&2 && return 1;
            fi

            rsync --mkpath --chmod='a+r' -q ~/wiki"$NEW" gwern@176.9.41.242:"/home/gwern/gwern.net$NEW" || red "gwmv: rsync failed?" > /dev/null &
            gwsed "$OLD" "$NEW"
            echo '"~^'"$OLD"'.*$" "'"$NEW"'";' | tee --append ~/wiki/static/redirect/nginx.conf # 3. add a redirect for the old URL to nginx
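            # (illustrative: for OLD='/doc/ai/2024-foo.pdf' & NEW='/doc/ai/scaling/2024-foo.pdf', the appended line is: "~^/doc/ai/2024-foo.pdf.*$" "/doc/ai/scaling/2024-foo.pdf";)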
            # 4. delete outdated annotations:
            OLD_FILE=$(basename "$OLD");
            rm "$HOME/wiki/metadata/annotation/*$OLD_FILE*" || true > /dev/null

            wait
            set +x
        fi
    fi
}

## Move a directory:
gwmvdir () {
    cd ~/wiki/
    OLD="$1"
    NEW="$2"
    if [[ "$OLD" == /* ]]; then
        OLD="${OLD:1}"
    fi
    if [[ "$NEW" == /* ]]; then
        NEW="${NEW:1}"
    fi

    OLD="$(readlink --canonicalize "$OLD" | cut -d '/' -f 5-)"
    ls "$OLD"
    NEW="$(readlink --canonicalize "$NEW" | cut -d '/' -f 5-)"
    mkdir -p "$NEW"
    git add "$NEW"

    for FILE in $(ls "$OLD"/* | grep --fixed-strings -v 'index.md'); do
        echo $FILE
        gwmv "$FILE" "$NEW/$(basename $FILE)"
    done
    rm "$OLD/index.md" || true
    rmdir "$OLD"
}

## 'Move' or tag an annotation:
## gwtag URL [tag]
## eg. `gwtag https://foo iq psychology sociology`
alias gwt="gwtag"
alias t="gwtag"
gwtag () { (
        wait; # just in case another tool might be running (eg. gwtag or gwsed)
        cd ~/wiki/ &&
            # echo "---" && grep --fixed-strings -- "$1" ./metadata/*.gtx || true
            timeout 20m nice ./static/build/changeTag +RTS -N2 -RTS "$@"; echo "" # &&
            # echo "---" && grep --fixed-strings -- "$1" ./metadata/*.gtx
    ); }

# eg. `"ai ai/anime ai/anime/danbooru ... ai/scaling ai/scaling/economics ... japan/poetry/shotetsu japan/poetry/teika ... technology/digital-antiquarian ... zeo/short-sleeper"`
GWERNNET_DIRS_FULL="$(cd ~/ && find wiki/doc/ -type d | grep --fixed-strings -v -e 'doc/rotten.com' -e 'doc/www/' \
                          -e 2000-iapac-norvir -e mountimprobable.com -e personal/2011-gwern-yourmorals.org \
                          -e psychology/european-journal-of-parapsychology -e reinforcement-learning/armstrong-controlproblem \
                          -e statistics/order/beanmachine-multistage -e gwern.net-gitstats -e metadata/annotation | \
                          cut --delimiter='/' --fields=3- | sort)"
# eg. `"1 2 2010-crc 2013-cicadas 3 4 5 abandoned-footnotes ab-testing ... www zeami zeo"`
GWERNNET_DIRS_SHORT="$(echo $GWERNNET_DIRS_FULL | tr '/' '\n' | tr ' ' '\n' | sort --unique)"
# for completing tags which may need to be disambiguated, like 'gpt/nonfiction':
# eg. `"1/lsd 2/2010-crc 3/fiction 3/nonfiction ... palm/2 personality/conscientiousness personality/psychopathy ... video/analysis video/generation vision/dream"`
GWERNNET_DIRS_SUFFIXES="$(echo $GWERNNET_DIRS_FULL | tr ' ' '\n' | grep --extended-regexp -e '[a-z0-9-]+/[a-z0-9-]+/[a-z0-9-]+' | \
                              rev | cut --delimiter='/' --fields=1-2 | rev | sort --unique)"
complete -W "$GWERNNET_DIRS_FULL $GWERNNET_DIRS_SHORT $GWERNNET_DIRS_SUFFIXES" u gwtag gwt t

alias u="upload" # 'upload' moved to ~/wiki/static/build/upload for easier calling from XMonad
## tab-complete the first argument as the local file, and the second argument as the remote directory:
_upload () {
    local cur cword
    _init_completion || return ## assumes `bash-completion` package is installed & sourced previously

    if [[ $cword -eq 1 ]]; then
        # File completion on first argument
        _filedir
    elif [[ $cword -eq 2 ]]; then
        # Directory completion on second argument
        COMPREPLY=( $(compgen -W "${GWERNNET_DIRS_FULL} ${GWERNNET_DIRS_SHORT} ${GWERNNET_DIRS_SUFFIXES}" -- "$cur") )
    fi
}
complete -F _upload upload
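
# eg. (illustrative) `upload ~/2024-foo.pdf ai/sc<TAB>` file-completes the first argument and tag-completes the second against the directory lists above (eg. to 'ai/scaling').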

# wait for a file (first argument) to become quiescent because eg. the web browser is still downloading it; optional second argument is the minimum size in kilobytes (eg. archived webpages are almost always >10kb these days, even if it's an old-school all-text HTML page):
is_downloading () {
    file="$1"
    min_size_kb="${2:-0}" # Default to 0 if not provided
    min_size_bytes=$((min_size_kb * 1024))
    current_time=$(date +%s)

    # Check if the file exists and is non-zero in size
    if [ -s "$file" ]; then
        file_size=$(stat -c %s "$file")
        modified_time=$(stat -c %Y "$file")
        elapsed_time=$((current_time - modified_time))

        # Check if file size meets the minimum requirement
        if [ "$file_size" -lt "$min_size_bytes" ]; then
            sleep 3
            is_downloading "$file" "$min_size_kb"
            return
        fi

        # Sleep if last-modified time is not at least 3 seconds ago
        if [ $elapsed_time -lt 3 ]; then
            sleep $((3 - elapsed_time))
            is_downloading "$file" "$min_size_kb"
        fi
    else
        # File doesn't exist or is zero-sized, wait and check again
        sleep 3
        is_downloading "$file" "$min_size_kb"
    fi
}

# shortcut for handling link-archiving review:
# `mvuri`: takes a filename with a URI encoding
# like `file:///home/gwern/wiki/doc/www/www.patterns.app/d7aaf7b7491492af22c98dae1079fbfa93961b5b.html`
# and transforms that argument into `/home/gwern/wiki/doc/www/www.patterns.app/d7aaf7b7491492af22c98dae1079fbfa93961b5b.html`
# (ignoring any anchor/hash on the original .html file) and then `mv`s the URL snapshot to that like normal.
# If the target starts with 'doc/' (perhaps because it has been copied from `/metadata/archive.hs` while manually fixing),
# it is treated as if it had started with `file:///home/gwern/wiki/doc/`.
#
# eg. `$ mvuri file:///home/gwern/wiki/doc/www/www.patterns.app/d7aaf7b7491492af22c98dae1079fbfa93961b5b.html`
#     `$ mvuri file:///home/gwern/wiki/doc/www/www.patterns.app/d7aaf7b7491492af22c98dae1079fbfa93961b5b.html#toc`
#     `$ mvuri doc/www/www.patterns.app/d7aaf7b7491492af22c98dae1079fbfa93961b5b.html`
# are all equivalent.
mvuri () {
    # Check if inotifywait is installed
    if ! command -v inotifywait &> /dev/null; then
        red "inotifywait could not be found. Please install the 'inotify-tools' package."
        exit
    fi

    local ENCODED_PATH="$1"
    local DECODED_PATH="${ENCODED_PATH//\%/\\x}"
    DECODED_PATH="${DECODED_PATH#file://}"
    DECODED_PATH="${DECODED_PATH%%#*}" # ignore anchors like `foo.html#id` by stripping them

    # Check if DECODED_PATH starts with 'doc/', if so prepend with '/home/gwern/wiki/'
    if [[ $DECODED_PATH == doc/* ]]; then
        DECODED_PATH="/home/gwern/wiki/$DECODED_PATH"
    fi
    local DESTINATION="$DECODED_PATH"
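    # at this point DESTINATION is the local snapshot path, with 'file://' and any '#anchor' stripped
    # (eg. '/home/gwern/wiki/doc/www/www.patterns.app/d7aaf7b7491492af22c98dae1079fbfa93961b5b.html' from the examples above)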
    if [ ! -f "$DESTINATION" ]; then red "WARNING: destination target $DESTINATION does not exist!"; fi
    local SOURCE

    # the browser may not have yet begun writing the file, so make sure we wait until it does:
    while true; do
        # we target the oldest .html file, in case others have started being created in the mean time:
        SOURCE="$(find ~/ -maxdepth 1 -name "*.html" -printf '%T+ %p\n' | sort --key=1,1 | cut --delimiter=' ' --fields=2- | head --lines=1)"
        if [ -n "$SOURCE" ]; then
            break
        else
            inotifywait --quiet --event create ~/
        fi
    done
    # the file may not yet be fully written, so we additionally wait until it's (probably) complete:
    is_downloading "$SOURCE" 10 # most web pages <10kb are broken or error pages

    bold "$SOURCE" "$(du -ch "$SOURCE")" "$DESTINATION"
    if [[ $(stat -c%s "$SOURCE") -ge 10000000 ]]; then
        # EXPERIMENTAL: optimize large HTML files (>10MB) by splitting back into separate files to avoid strict downloads
        mv "$SOURCE" "$DESTINATION"
        php ~/wiki/static/build/deconstruct_singlefile.php "$DESTINATION"
    else
        mv "$SOURCE" "$DESTINATION"
    fi

    echo -n -e '\a'; # ring bell because done
}

# everyNDays: returns a boolean true every _N_ whole-number days in a systematic way based on date modulus; can be used in place of `$RANDOM` calls to do an operation 'randomly' every once in a while, by calling `everyNDays N && ...` or `if everyNDays N; then ...; else ...; fi`.
# (Better than actual randomness because it avoids clumping/starvation, or potentially doing all of the operations on the same run by chance; and far simpler than any explicit tracking of state/date.)
# The optional second whole-number argument is an 'offset' to avoid clumping of multiple calls with the same _N_; they can be offset by relatively-prime numbers or just plain incremented.
everyNDays () {
    (( (($(date +%j) + ${2:-0}) % $1) == 0 )) # optional offset second argument to stagger or space out multiple calls of _N_ relative to each other
}

# sort a list of directories from stdin by their most recently modified file.
# (We ignore any file not tracked by git, because it may be auto-generated or modified spuriously.
# We treat sub-directories as separate directories, so a directory with a modified file doesn't trigger all its parents as well.)
#
# This is useful for prioritizing commands, particularly if they may crash on an error:
# errors (eg. broken links or malformed syntax) will usually be in the most-recently-modified files,
# so processing them first reduces latency to fix.
sort_by_lastmodified () {
    local git_files latest_file timestamp
    while read -r dir; do
        # Change to the directory
        pushd "$dir" >/dev/null || continue

        # Get list of files not ignored by git
        git_files=$(git ls-files --exclude-standard)

        if [ -n "$git_files" ]; then
            # Find the most recently modified file among git-tracked files
            latest_file=$(git ls-files --exclude-standard -z | xargs -0 stat -f '%m %N' 2>/dev/null | sort --reverse --numeric | head --lines=1)
        else
            # If no git-tracked files, find the most recently modified untracked file
            latest_file=$(find . -type f -not -path './.git/*' -printf '%T@ %p\n' 2>/dev/null | sort --reverse --numeric | head --lines=1)
        fi

        if [ -n "$latest_file" ]; then
            # Extract the timestamp and print it with the directory
            timestamp=$(echo "$latest_file" | cut --delimiter=' ' --fields=1)
            echo "$timestamp $dir"
        else
            # If no files found, use the directory's modification time
            timestamp=$(stat -f '%m' . 2>/dev/null || stat -c '%Y' .)
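            # ('stat -f %m' is the BSD syntax; 'stat -c %Y' is the GNU coreutils fallback)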
            echo "$timestamp $dir"
        fi

        # Return to the original directory
        popd >/dev/null
    done | sort --reverse --numeric | cut --delimiter=' ' --fields=2-
}

# KNOWN GOOD / GOLD UNIT-TESTING (experimental)
# The `lorem*` test pages exercise most site functionality, and many errors or regressions in new patches should show up as changes in the generated HTML.
# So we can simply compare the new HTML to old stored snapshots to inspect any changes (`run_gold_test`), and if the changes are good, update the snapshots (`lorem_update`).
SITE_URL="https://gwern.net"
SNAPSHOT_DIR="./metadata/snapshot"
CONTENT_DIR="$HOME/wiki/"

get_lorem_pages () {
    find "$CONTENT_DIR" -name "lorem*.md" -printf "%f\n" | sed 's/\.md$//' | sort
}

lorem_download () {
    local page=$1
    local output_file="${SNAPSHOT_DIR}/${page}"
    local temp_file; temp_file=$(mktemp)

    # we version the infrastructure files with a 10-digit seconds-level Unix timestamp to bust caches; those version-stamps change every time those files are modified (eg. to add a link-icon) and would trigger spurious changes. So we use sed to remove them from both downloaded snapshots & downloaded live pages.
    curl --silent "${SITE_URL}/${page}" | sed 's/\.\(css\|js\|svg\)?v=[0-9]\{10\}/.\1/g' >> "$temp_file"
    mv "$temp_file" "${output_file}"
}

compare_page () {
    local page=$1
    local snapshot_file="${SNAPSHOT_DIR}/${page}"
    local temp_file; temp_file=$(mktemp)

    curl --silent "${SITE_URL}/${page}" | sed 's/\.\(css\|js\|svg\)?v=[0-9]\{10\}/.\1/g' >> "${temp_file}"

    if ! diff --ignore-space-change --brief "${temp_file}" "${snapshot_file}" >/dev/null; then
        red "Changes detected in \"${page}\":"
        # Git: "If exactly two paths are given and at least one points outside the current repository, git diff will compare the two files / directories."; this works better for diffing HTML than any invocations of `diff` I found.
        git diff --color-words "${snapshot_file}" "${temp_file}"
    fi

    rm "${temp_file}"
}

lorem_update () {
    bold "Updating snapshots…"
    mkdir -p "${SNAPSHOT_DIR}"
    get_lorem_pages | while read -r page; do
        lorem_download "${page}"
        bold "Updated snapshot for \"${page}\""
    done
}

run_gold_test () {
    get_lorem_pages | while read -r page; do
        compare_page "${page}"
    done
}
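
# eg. (illustrative workflow) after deploying changes, run `run_gold_test` to diff the live `lorem*` pages against ./metadata/snapshot/; if the reported differences are intended, run `lorem_update` to refresh the snapshots.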