#!/bin/bash
# download-title.sh
# Author: Gwern Branwen
# Date: 2024-06-10
# When: Time-stamp: "2024-06-10 21:07:12 gwern"
# License: CC-0
#
# download-title.sh: download a URL, and if it is a parseable HTML page, print out the contents of the title tag, `
TITLE`.
#
# $ ./download-title.sh "http://catb.org/~esr/writings/taoup/html/ch05s01.html"
# The Importance of Being Textual
#
# dependencies: curl, iconv, xmllint, timeout
set -e
if [ $# -lt 1 ]; then echo "download-title.sh: Error: Not enough arguments" && exit 1; fi
# make sure xmllint is available for HTML parsing:
if ! command -v xmllint > /dev/null; then
echo "download-title.sh: Error: xmllint is not installed. Please install the 'libxml2-utils'/'libxml2' package"
exit 2
fi
# Function to extract the title from HTML
extract_title() {
local URL="$1"
local CONTENT_TYPE
# Fetch the URL headers to check the content type
CONTENT_TYPE=$(timeout 20s curl --silent --head "$URL" | grep --ignore-case "Content-Type:")
# Check if the content type is HTML
if echo "$CONTENT_TYPE" | grep --quiet --ignore-case "text/html"; then
# Fetch the URL content and extract the title
timeout 20s curl --silent "$URL" | iconv -c --to-code=utf8 | \
xmllint --html --xpath '//title/text()' - 2>/dev/null
fi
}
for PAGE in "$@"; do
extract_title "$PAGE"
done