#!/bin/bash
# download-title.sh
# Author: Gwern Branwen
# Date: 2024-06-10
# When: Time-stamp: "2024-06-30 17:44:06 gwern"
# License: CC-0
#
# download-title.sh: download a URL, and if it is a parseable HTML page, print out the contents of the title tag, `<title>TITLE</title>`.
#
# $ ./download-title.sh "http://catb.org/~esr/writings/taoup/html/ch05s01.html"
# The Importance of Being Textual
# $ ./download-title.sh https://blog.nationalmuseum.ch/en/2024/06/the-dream-of-an-alpine-waterway/
# The dream of an alpine waterway – Swiss National Museum - Swiss history blog
#
# dependencies: curl, xmllint, timeout, file, iconv, perl (with the HTML::TreeBuilder & HTML::Entities modules)
# Strict mode: abort on any unhandled failure, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# At least one URL is required. Diagnostics go to stderr so that stdout only
# ever carries titles and callers capturing it never see an error message.
if [ $# -lt 1 ]; then
  echo "download-title.sh: Error: Not enough arguments" >&2
  exit 1
fi

# make sure xmllint is available for HTML parsing:
# NOTE(review): nothing in the visible script invokes xmllint; the check (and
# its exit status 2) is kept so existing callers see the same behavior.
if ! command -v xmllint > /dev/null; then
  echo "download-title.sh: Error: xmllint is not installed. Please install the 'libxml2-utils'/'libxml2' package" >&2
  exit 2
fi

# Also verify the tools the script actually runs, so a missing one is reported
# clearly up front instead of as a bare "command not found" mid-download.
for REQUIRED_TOOL in curl timeout perl; do
  if ! command -v "$REQUIRED_TOOL" > /dev/null; then
    echo "download-title.sh: Error: $REQUIRED_TOOL is not installed." >&2
    exit 2
  fi
done
#######################################
# Download a URL and print the text of its HTML <title> element.
# Arguments:
#   $1 - URL to fetch (redirects are followed; 20-second overall timeout).
# Outputs:
#   Writes the title to stdout with all newlines removed and leading/trailing
#   whitespace stripped. Prints nothing if the page has no <title>.
# Returns:
#   0 on success; 1 if the temp file cannot be created; otherwise the exit
#   status of the failed download (timeout/curl) or of the parsing pipeline.
#   The temporary file is removed on every path.
#######################################
extract_title() {
  local URL="$1"
  local TEMP_FILE
  local rc=0
  TEMP_FILE=$(mktemp) || { echo "Failed to create temp file" >&2; return 1; }

  # Fetch the URL content and save to a temporary file. The status is captured
  # explicitly so the temp file is cleaned up even when the download fails
  # (under `set -e` an unchecked failure would abort before reaching `rm`).
  timeout 20s curl --silent -L "$URL" > "$TEMP_FILE" || rc=$?
  if (( rc != 0 )); then
    rm -f -- "$TEMP_FILE"
    return "$rc"
  fi

  # Use perl to parse the HTML, decode entities and extract the title; then
  # drop newlines and trim surrounding whitespace.
  perl -MHTML::Entities -MHTML::TreeBuilder -e '
    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file(shift);
    my $title = $tree->look_down(_tag => "title");
    if ($title) {
      print decode_entities($title->as_text);
    }
    $tree->delete;
  ' "$TEMP_FILE" \
    | tr -d '\n' \
    | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' || rc=$?

  # Clean up, then report the pipeline's status to the caller.
  rm -f -- "$TEMP_FILE"
  return "$rc"
}
# Print the title of every URL given on the command line, in argument order.
# extract_title removes newlines from its output, so a newline is written
# *between* consecutive titles to keep them from running together on one line.
# Output for a single URL is unchanged (no separator is emitted).
NEED_SEPARATOR=0
for PAGE in "$@"; do
  if (( NEED_SEPARATOR )); then
    printf '\n'
  fi
  extract_title "$PAGE"
  NEED_SEPARATOR=1
done