{- LinkMetadata: module for generating Pandoc links which are annotated with metadata, which can
                    then be displayed to the user as 'popups' by /static/js/popups.js. These popups can be excerpts,
                    abstracts, article introductions etc., and make life much more pleasant for the reader—hover over
                    link, popup, read, decide whether to go to link.
Author: Gwern Branwen
Date: 2019-08-20
When:  Time-stamp: "2026-05-10 22:34:06 gwern"
License: CC-0
-}

{-# LANGUAGE OverloadedStrings #-}
module LinkMetadata (addPageLinkWalk, isPagePath, readLinkMetadata, readLinkMetadataSlow, readLinkMetadataAndCheck, walkAndUpdateLinkMetadata, walkAndUpdateLinkMetadataGTX, updateGwernEntries, writeAnnotationFragments, Metadata, MetadataItem, MetadataList, readGTXFast, writeGTX, annotateLink, createAnnotations, hasAnnotation, hasAnnotationOrIDInline, generateAnnotationTransclusionBlock, authorsToCite, cleanAbstractsHTML, sortItemDate, sortItemPathDate, sortItemPathDateModified, sortItemDateModified, sortByDateModified, sortByDatePublished, lookupFallback, sortItemPathDateCreated, addCanPrefetch, annotationSizeDB, generateFileTransclusionBlock) where

import Control.Monad (unless, void, when, foldM_, (<=<))

import Data.Char (isPunctuation, isNumber)
import Data.Maybe (fromMaybe)
import qualified Data.Map.Strict as M (elems, empty, filter, filterWithKey, fromList, fromListWith, keys, toList, lookup, map, union, size, member, keysSet, Map) -- traverseWithKey, union, Map
import qualified Data.Set as Set (member, null)
import qualified Data.Text as T (append, isInfixOf, isPrefixOf, pack, unpack, replace, Text, unlines)
import Data.Containers.ListUtils (nubOrd)
import Data.Function (on)
import Data.List (intersect, isInfixOf, isPrefixOf, isSuffixOf, sort, sortBy, (\\))
import Data.List.HT (search) -- utility-ht
import Network.HTTP (urlEncode)
import Network.URI (isURIReference)
import System.Directory (doesFileExist, doesDirectoryExist, getFileSize)
import System.FilePath (takeDirectory, takeFileName, takeExtension)
import Text.Pandoc (Inline(Link, Span),
                    writeHtml5String, runPure,
                    nullAttr, nullMeta, Attr, Target,
                    Inline(Code, Image, Str, RawInline, Space, Strong), Pandoc(..), Format(..), Block(RawBlock, Para, BlockQuote, Div))
import Text.Pandoc.Walk (walk, walkM)
import Text.Show.Pretty (ppShow)

import qualified Control.Monad.Parallel as Par (mapM_, mapM) -- monad-parallel

import System.IO.Unsafe (unsafePerformIO)

import Inflation (nominalToRealInflationAdjuster, nominalToRealInflationAdjusterHTML, isInflationURL)
import Interwiki (convertInterwikiLinks, isWPAPI)
import Typography (titlecase', typesetHtmlField, titleWrap)
import Image (addImgDimensions, imageLinkHeightWidthSet, isImageFilename, isVideoFilename)
import LinkArchive (localizeLink, ArchiveMetadata, localizeLinkURL, calculateArchiveSizePercentiles)
import LinkBacklink (getSimilarLinkCheck, getSimilarLinkCount, getBackLinkCount, getBackLinkCheck, getLinkBibLinkCheck, getAnnotationLink)
import LinkID (authorsToCite, generateID, getDisambiguatedPairs)
import LinkLive (linkLive, alreadyLive, linkLiveString)
import LinkMetadataTypes (Metadata, MetadataItem, Path, MetadataList, Failure(Temporary, Permanent), isPagePath, hasHTMLSubstitute, SizeDB)
import Query (extractLinksInlines)
import Tags (listTagsAll, tagsToLinksSpan)
import Metadata.Format (processDOI, cleanAbstractsHTML, linkCanonicalize, balanced) -- authorsInitialize,
import Metadata.Date (dateTruncateBad, isDate)
import Utils (writeUpdatedFile, printGreen, printRed, anyInfix, anyPrefix, anySuffix, replace, anyPrefixT, hasAny, safeHtmlWriterOptions, addClass, hasClass, parseRawAllClean, hasExtensionS, isLocal, kvDOI, delete, safeGetFileSize, calculateSizeToPercentileMap, kvLookup)
import Annotation (linkDispatcher)
import Annotation.Gwernnet (gwern)
import LinkIcon (linkIcon)
import GTX (appendLinkMetadata, readGTXFast, readGTXSlow, rewriteLinkMetadata, writeGTX, untupleize)
import Metadata.Author (authorCollapse)

import qualified Config.LinkID as CLID (affiliationAnchors)
import qualified Config.Misc as CM (fileExtensionToEnglish, minFileSizeWarning, minimumAnnotationLength, currentMonthAgo, todayDayString, currentYear, gtxKeyValueKeyNames)
import qualified Config.Metadata.Author as CA (authorLinkDB, authorWhitelist)
import qualified Config.LinkMetadata as C (badDOISubstrings, badTitleLeadingChars, badTitleTrailingChars, badAuthorSubstrings, allowedNonHttpURLPrefixes, uriValidationExemptInfixes, ignoredMalformedURLPrefixes, duplicateAffiliationWhitelist, youtubeWatchPrefix, twitterHostPrefix, twitterStatusInfix, wikipediaArticleInfix, documentPreviewableExtensions, codePreviewableExtensions, fileViewableExtensions, futureYearSlack, partialAnnotationIgnoredTagCount, partialAnnotationBacklinkThreshold, partialAnnotationSimilarThreshold, annotationURLWarningLength, annotationURLPreviewLength, missingTitleAbstractMinLength, maxPrefetchBytes, annotationClasses, positiveAnnotationClasses)

-- we have 4 kinds of URLs we can look up sizes for: (1) external URLs which have been locally-archived and have a known fixed on-disk size we can easily obtain; due to the subtleties of splitting HTML files, we outsource that to LinkArchive.calculateArchiveSizePercentiles; (2) local files with extensions like PDFs, for which we can simply call `safeGetFileSize`; (3) local *essays* with no extensions, but also no ID/anchor fragment, where we append ".md"; (4) local essays with anchors/IDs, where logically we would parse the Markdown to infer what the 'actual' size is of the specified ID, but we will punt and just treat it as if the anchor were not there, by deleting it.
-- TODO: we don't seem to use the percentile information anymore. Should we remove it (and simplify the code/data considerably, and let us remove the span.filesize wrapper too)?
annotationSizeDB :: Metadata -> ArchiveMetadata -> IO SizeDB
annotationSizeDB md am =
  do archiveSizes <- calculateArchiveSizePercentiles am :: IO SizeDB

     let localFiles = (filter (\u -> '.' `elem` u && isLocal (T.pack u)) $ M.keys md) :: [FilePath]
     fileSizesRaw <- Par.mapM (\u -> do let u' = takeWhile (/='#') $ tail u
                                        size <- fromIntegral <$> safeGetFileSize u'
                                        return size
                          ) localFiles
     let fileHistogram = calculateSizeToPercentileMap fileSizesRaw
     let filePercentiles = zipWith (\u s -> (u, (s, fromMaybe 0 $ M.lookup s fileHistogram))) localFiles fileSizesRaw :: [(FilePath, (Int,Int))]
     let fileSizes = M.fromList filePercentiles :: SizeDB

     let localEssays = (filter (\u -> '.' `notElem` u && isLocal (T.pack u)) $ M.keys md) :: [FilePath]
     essaySizesRaw <- Par.mapM (\u -> do let u' = (takeWhile (/='#') $ tail u) ++ ".md"
                                         size <- fromIntegral <$> safeGetFileSize u'
                                         return size
                          ) localEssays
     let essayHistogram = calculateSizeToPercentileMap essaySizesRaw
     let essayPercentiles = zipWith (\u s -> (u, (s, fromMaybe 0 $ M.lookup s essayHistogram))) localEssays essaySizesRaw :: [(FilePath, (Int,Int))]
     let essaySizes = M.fromList essayPercentiles :: SizeDB

     let sizes = essaySizes `M.union` archiveSizes `M.union` fileSizes

     return sizes

filesizeSpan :: SizeDB -> FilePath -> [Inline]
filesizeSpan sdb u =
  case M.lookup u sdb of
    Nothing -> []
    Just (bytes, percentile) ->
      [Span ("", ["filesize"], [])
        [Span ("", ["filesize-bytes"], []) [Str (T.pack $ show bytes)],
         Space,
         Span ("", ["filesize-percentage"], []) [Str (T.pack $ show percentile)]]]

-- Should the current link get a 'G' icon because it's an essay or regular page of some sort?
-- we exclude several directories (doc/, static/) entirely; a Gwern.net page is then any
-- link without a file extension (ie. a '.' in the URL - we guarantee that no Markdown essay has a
-- period inside its URL).
-- Essay/page links get the 'link-page' class.
addPageLinkWalk :: Pandoc -> Pandoc
addPageLinkWalk = walk addPageLink

addPageLink :: Inline -> Inline
addPageLink y@(Link (a,b,c) e (f,g)) = if "link-page" `elem` b || "link-page-not" `elem` b || not (isPagePath f) then y
                                        else Link (a, "link-page" : b, c) e (f, g)
addPageLink x = x

-------------------------------------------------------------------------------------------------------------------------------

-- Run an arbitrary function on the 4 GTX databases (me/full/half/auto) to update individual items.
-- For example, to use `processDOIArxiv` to add inferred-DOIs to all Arxiv annotations prior to Arxiv adding official DOIs, one could run a command like:
--
-- > walkAndUpdateLinkMetadata True (\x@(path,(title,author,date,dateModified,doi,tags,abstrct)) -> if not ("https://arxiv.org" `isPrefixOf` path) || (doi /= "") then return x else return (path,(title,author,date,dateModified,processDOIArxiv path,tags,abstrct)))
--
-- To rewrite a tag, eg. 'conscientiousness' → 'psychology/personality/conscientiousness':
--
-- > walkAndUpdateLinkMetadata True (\(path,(title,author,date,dateModified,kvs,tags,abst)) -> return (path,(title,author,date,dateModified,kvs,
--      map (\t -> if t/="conscientiousness" then t else "psychology/personality/conscientiousness") tags,  abst)) )
--
-- To do IO (eg. calling an API):
--
-- > walkAndUpdateLinkMetadata True (\(path,(title,author,date,dateModified,kvs,tags,abst)) ->
-- >  do { abst' <- Paragraph.processParagraphizer md path abst;
-- >       return (path,(title,author,date,dateModified,kvs,tags, abst')) } )
--
-- To add 'Gwern' as an author to all local files which have no author & have "-gwern-" in the filename:
--
-- > walkAndUpdateLinkMetadata True (\x@(u, (t,aut,d,dc,misc,tags,abst)) -> if (head u == '/' && "-gwern-" `isInfixOf` u && aut == "") then return (u,(t,"Gwern",d,dc,misc,tags,abst)) else return x)
--
-- To execute a command read-only over a specific part of the DB, like checking how the title-italicizer works:
--
-- > let f = \x@(path,(title,_,_,_,_,_,_)) -> do { title' <- Annotation.processItalicizer title; Control.Monad.when (title /= title')(print title >> print title'); return x; }
-- > walkAndUpdateLinkMetadataGTX f "metadata/half.gtx"
--
-- To test the date-guesser against ground truths:
--
-- > let f = \x@(path,(_,_,date,_,_,_,_)) -> if (date == "") then return x else (do { dateGuess <- Metadata.Date.guessDateFromString path; Control.Monad.when (not (dateGuess `isPrefixOf` date)) (print (path ++ " was " ++ date ++ "; but guessed: " ++ dateGuess)); return x; })
--
-- To push hardwired link-ID overrides into the respective metadata database annotation entries:
--
-- > let f = \x@(path,(a,b,c,d,e,f,g)) -> return $ case (lookup path CLID.linkIDOverrides) of { Nothing -> x; Just ident -> (path,(a,b,c,d,e++[("id",T.unpack ident)],f,g)) }
-- > walkAndUpdateLinkMetadata True f
--
-- To fix malformed dates:
--
-- > walkAndUpdateLinkMetadata True ( \x@(path,(title,author,date,dateModified,kvs,tags,abst)) -> if (date/="" && not (Metadata.Date.isDate date)) then (do { date' <- Metadata.Date.guessDateFromString date; return (path,(title,author,date',dateModified,kvs,tags,abst)) }) else return x )
walkAndUpdateLinkMetadata :: Bool -> ((Path, MetadataItem) -> IO (Path, MetadataItem)) -> IO ()
walkAndUpdateLinkMetadata check f = do walkAndUpdateLinkMetadataGTX f "metadata/me.gtx"
                                       walkAndUpdateLinkMetadataGTX f "metadata/full.gtx"
                                       walkAndUpdateLinkMetadataGTX f "metadata/half.gtx"
                                       walkAndUpdateLinkMetadataGTX f "metadata/auto.gtx"
                                       when check (printGreen "Checking…" >> readLinkMetadataAndCheck >> printGreen "Validated all GTX post-update; exiting.")

walkAndUpdateLinkMetadataGTX :: ((Path, MetadataItem) -> IO (Path, MetadataItem)) -> Path -> IO ()
walkAndUpdateLinkMetadataGTX f file = do db <- readGTXSlow file -- TODO: refactor this to take a list of URLs to update, then I can do it incrementally & avoid the mysterious space leaks
                                         db' <-  mapM f db
                                         writeGTX file db'
                                         printGreen $ "Updated " ++ file

-- This can be run every few months to update abstracts (they generally don't change much).
updateGwernEntries :: IO ()
updateGwernEntries = do -- rescrapeGTX gwernEntries "metadata/me.gtx"
                        -- rescrapeGTX gwernEntries "metadata/full.gtx" -- TODO: why would we try to rescrape me/full.gtx? is there anything we really want to clobber (even assuming we are excluding /blog/?); temporarily commented out to see if anything breaks
                        rescrapeGTX gwernEntries "metadata/half.gtx"
                        rescrapeGTX gwernEntries "metadata/auto.gtx"
                        readLinkMetadataAndCheck >> printGreen "Validated all GTX post-update; exiting…"
  where gwernEntries path = ("/" `isPrefixOf` path || "https://gwern.net" `isPrefixOf` path) && not ("." `isInfixOf` path)

-- eg. to rescrape a specific page from the CLI:
-- `cd ~/wiki/ && ghci -istatic/build/ ./static/build/LinkMetadata.hs -e 'rescrapeGTX (\p -> p == "/fiction/christmas") "metadata/half.gtx"'`
rescrapeGTX :: (Path -> Bool) -> Path -> IO ()
rescrapeGTX filterF gtxpath = do dbl <- readGTXFast gtxpath
                                 let paths = filter filterF $ map fst dbl
                                 foldM_ (rescrapeItem gtxpath) dbl paths

rescrapeItem :: Path -> MetadataList -> Path -> IO MetadataList
rescrapeItem gtx dblist path =
  case lookup path dblist of
   Just old -> do new <- updateGwernEntry (path,old)
                  if (path,old) /= new then do let dblist' = new : filter ((/=) path . fst) dblist
                                               writeGTX gtx dblist'
                                               readGTXFast gtx
                   else return dblist
   Nothing -> return dblist

updateGwernEntry :: (Path, MetadataItem) -> IO (Path, MetadataItem)
updateGwernEntry x@(path,(title,author,date,dc,kvs,tags,_)) = if False then return x -- || not ("index"`isInfixOf` path)
    else do printGreen path
            newEntry <- gwern M.empty path
            case newEntry of
              Left Temporary -> return x
              Left Permanent -> return (path,(title,author,date,dc,kvs,tags,"")) -- zero out the abstract but preserve the other metadata; if we mistakenly scraped a page before and generated a pseudo-abstract, and have fixed that mistake so now it returns an error rather than pseudo-abstract, we want to erase that pseudo-abstract until such time as it returns a 'Right' (a successful real-abstract)
              Right (path', (title',author',date',dc',kvs',_,abstract')) -> return (path', (title',author',date',dc',kvs',tags,abstract'))

-- read the annotation base (no checks, >8× faster)
readLinkMetadata :: IO Metadata
readLinkMetadata = do
             me   <- readGTXFast "metadata/me.gtx"
             full <- readGTXFast "metadata/full.gtx"  -- for hand created definitions, to be saved; since it's handwritten and we need line errors, we use GTX:
             half <- readGTXFast "metadata/half.gtx" -- tagged but not handwritten/cleaned-up
             auto <- readGTXFast "metadata/auto.gtx"    -- auto-generated cached definitions; can be deleted if gone stale
             -- merge the hand-written & auto-generated link annotations, and return:
             let final = M.union (M.fromList me) $ M.union (M.fromList full) $ M.union (M.fromList half) (M.fromList auto) -- left-biased, so 'me overrides ''full' overrides 'half' overrides 'auto'
             return final

readLinkMetadataSlow :: IO Metadata
readLinkMetadataSlow = do
             me   <- readGTXSlow "metadata/me.gtx"
             full <- readGTXSlow "metadata/full.gtx"  -- for hand created definitions, to be saved; since it's handwritten and we need line errors, we use GTX:
             half <- readGTXSlow "metadata/half.gtx" -- tagged but not handwritten/cleaned-up
             auto <- readGTXSlow "metadata/auto.gtx"    -- auto-generated cached definitions; can be deleted if gone stale
             -- merge the hand-written & auto-generated link annotations, and return:
             let final = M.union (M.fromList me) $ M.union (M.fromList full) $ M.union (M.fromList half) (M.fromList auto) -- left-biased, so 'me' overrides 'full' overrides 'half' overrides 'auto'
             return final

-- read the annotation database, and do extensive semantic & syntactic checks for errors/duplicates:
-- TODO: split out into 3 functions at different levels of intensity: 1 full, 1 half, 1 auto and the composition; many of these functions would be better off in Metadata.Format or somewhere
readLinkMetadataAndCheck :: IO Metadata
readLinkMetadataAndCheck = do
             -- for hand created definitions, to be saved; since it's handwritten and we need line errors, we use GTX:
             me      <- readGTXSlow "metadata/me.gtx"
             fullGTX <- readGTXSlow "metadata/full.gtx"
             let full = me ++ fullGTX

             -- Quality checks:
             -- requirements:
             -- - URLs/keys must exist, be unique, and either be a remote URL (starting with 'h') or a local filepath (starting with '/') which exists on disk (auto.gtx may have stale entries, but me/full.gtx should never! This indicates a stale annotation, possibly due to a renamed or accidentally-missing file, which means the annotation can never be used and the true URL/filepath will be missing the hard-earned annotation). We strip http/https because so many websites now redirect and that's an easy way for duplicate annotations to exist.
             -- - titles must exist & be unique (overlapping annotations to pages are disambiguated by adding the section title or some other description)
             -- - authors must exist (if only as 'Anonymous' or 'N/A'), but are non-unique
             -- - dates are non-unique & optional/NA for always-updated things like Wikipedia. If they exist, they should be of the format 'YYYY[-MM[-DD]]'.
             -- - DOIs are optional since they usually don't exist, and non-unique (there might be annotations for separate pages/anchors for the same PDF and thus same DOI; DOIs don't have any equivalent of `#page=n` I am aware of unless the DOI creator chose to mint such DOIs, which they never (?) do). DOIs sometimes use hyphens and so are subject to the usual problems of em/en-dashes sneaking in by 'smart' systems screwing up.
             -- - tags are optional, but all tags should exist on-disk as a directory of the form "doc/$TAG/"
             -- - annotations must exist and be unique inside me/full.gtx (overlap in auto.gtx can be caused by the hacky appending); their HTML should pass some simple syntactic validity checks
             -- - the key-value list can be empty, but any entries must have non-empty string keys & values
             -- CHECK FULL-ONLY
             let urlsC = map fst full
             let normalizedUrlsC = map (delete "https://" . delete "http://") urlsC
             when (length (nubOrd (sort normalizedUrlsC)) /=  length normalizedUrlsC) $ error $ "me/full.gtx: Duplicate URLs! " ++ unlines (normalizedUrlsC \\ nubOrd normalizedUrlsC)

             let tagsAllC = nubOrd $ concatMap (\(_,(_,_,_,_,_,ts,_)) -> ts) full

             -- mandatory field check (includes checking for empty annotation string):
             let emptyCheck = filter (\(u,(t,a,_,_,_,_,s)) ->  "" `elem` [u,t,a,s]) full
             unless (null emptyCheck) $ error $ "me/full.gtx: Link Annotation Error: empty mandatory fields! [URL/title/author/abstract] This should never happen: " ++ show emptyCheck

             duplicateAbstracts full

             -- CHECK HALF-ONLY
             -- intermediate link annotations: not finished, like 'me/full.gtx' entries, but also not fully auto-generated.
             -- This is currently intended for storing entries for links which I give tags (probably as part of creating a new tag & rounding up all hits), but which are not fully-annotated; I don't want to delete the tag metadata, because it can't be rebuilt, but such half annotations can't be put into 'me/full.gtx' without destroying all of the checks' validity.
             half <- readGTXSlow "metadata/half.gtx"
             let (fullPaths,halfPaths) = (map fst full, map fst half)
             let redundantHalfs = fullPaths `intersect` halfPaths
             unless (null redundantHalfs) (printRed "Redundant entries in half.gtx & me/full.gtx: " >> printGreen (show redundantHalfs))
             duplicateAbstracts (filter (\(p,_) -> not (isLocal (T.pack p))) half) -- filter out local paths, because there are annoying reasons (for now) why annotations of essays/essay sections may be redundant. TODO: fix that.

             let urlsCP = map fst (full ++ half)
             let files = map (takeWhile (/='#') . tail) $ filter (\u -> head u == '/') urlsCP

             let ensureExtension f = if '.' `elem` f then f else f ++ ".md"
             let checkFile f = fmap not $ doesFileExist $ ensureExtension f
             fileChecks <- Par.mapM checkFile files
             let missingFiles = map fst $ filter snd $ zip files fileChecks
             let printError f = printRed ("Full+half annotation error: file does not exist? " ++ f ++ " (checked file name: " ++ (ensureExtension f) ++ ")")
             mapM_ printError missingFiles

             -- auto-generated cached definitions; can be deleted if gone stale
             rewriteLinkMetadata half full "metadata/auto.gtx" -- do auto-cleanup first
             auto <- readGTXSlow "metadata/auto.gtx"
             duplicateAbstracts (filter (\(p,_) -> not (isLocal (T.pack p))) auto)

             -- merge the hand-written & auto-generated link annotations, and return:
             let final = M.union (M.fromList full) $ M.union (M.fromList half) (M.fromList auto) -- left-biased, so 'full' overrides 'half' overrides 'auto'
             let finalL = M.toList final

             -- CHECK ALL (FULL+HALF+AUTO):
             let badDoisDash = filter (\(_,(_,_,_,_,kvs,_,_)) -> anyInfix (kvDOI kvs) C.badDOISubstrings) finalL
             unless (null badDoisDash) $ error $ "GTXes: Bad DOIs (invalid punctuation in DOI): " ++ show badDoisDash
             -- about the only requirement for DOIs, aside from being made of graphical Unicode characters (which includes spaces <https://www.compart.com/en/unicode/category/Zs>!), is that they contain one '/':
             -- <https://www.doi.org/doi_handbook/2_Numbering.html#2.2.3> "The DOI syntax shall be made up of a DOI prefix and a DOI suffix separated by a forward slash. There is no defined limit on the length of the DOI name, or of the DOI prefix or DOI suffix. The DOI name is case-insensitive and can incorporate any printable characters from the legal graphic characters of Unicode." <https://www.doi.org/doi_handbook/2_Numbering.html#2.2.1>
             -- Thus far, I have not run into any real DOIs which omit numbers, so we'll include that as a check for accidental tags inserted into the DOI field.
             -- One of the most common errors with DOIs is swapping them for another metadata field like date or tag, so we can check for those. (DOIs may be confused with authors but authors never have a '/' in them and so checking for those handles that case.)
             let badDois = filter (\(_,(_,_,_,_,kvs,_,_)) -> let doi = kvDOI kvs in if (doi == "") then False else doi `elem` tagsAllC || head doi `elem` ['a'..'z'] || '/' `notElem` doi || null ("0123456789" `intersect` doi) || "https" `isPrefixOf` doi || isDate doi) finalL
             unless (null badDois) $ error $ "GTXes: Invalid DOI (missing mandatory forward slash or a number): " ++ show badDois

             -- NOTE: titles can validly begin/end with a forward-slash if they are, say, a subreddit. Titles also often end in colons for Twitter, where it is about an implied attached screenshot or image or retweet. And '+' ends a number of technology-related titles ('Google+', 'DC++', 'C++').
             let badTitles = filter (\(_,(t,_,_,_,_,_,_)) -> t /= "" && (last t `elem` C.badTitleTrailingChars || head t `elem` C.badTitleLeadingChars)) finalL
             unless (null badTitles) $ error $ "GTXes: Link Annotation Error: mangled title? Begins/ends in a strange character that should probably never happen a well-formed title: " ++ show badTitles

             let badKVs = filter (\(_,(_,_,_,_,kvs,_,_)) -> any (\(k,v) -> k `notElem` CM.gtxKeyValueKeyNames || k == "" || v == "") kvs) finalL
             unless (null badKVs) $ error $ "GTX: bad key-values in annotations, with unknown keys (not in the whitelist `Config.Misc.gtxKeyValueKeyNames`), or null keys/values: " ++ show badKVs

             let urlsFinal = M.keys final
             let brokenUrlsFinal = filter (\u -> null u ||
                                            not (head u == 'h' || head u == '/' || anyPrefix u C.allowedNonHttpURLPrefixes) ||
                                            (head u == '/' && "//" `isInfixOf` u) ||
                                            ' ' `elem` u ||
                                            ('—' `elem` u) || -- EM DASH
                                             -- empty anchors are meaningless, and imply a malformed URL where an anchor was intended but got lost; similarly for other common trailing typos, unless it is a local page where we're doing fancy range includes:
                                            (head u /= '/' && last u == '#') ||
                                            last u `elem` ['?', '&']
                                          )
                                   urlsFinal
             unless (null brokenUrlsFinal) $ error $ "GTX: Broken URLs: " ++ show brokenUrlsFinal

             let balancedQuotes = filter (\(_,(_,_,_,_,_,_,abst)) -> let count = length $ filter (=='"') abst in
                                             count > 0 && (count `mod` 2 == 1) ) finalL
             unless (null balancedQuotes) $ error $ "GTX: Link Annotation Error: unbalanced double quotes! " ++ show balancedQuotes

             let balancedBrackets = map (\(p,(title',_,_,_,_,_,abst) ) -> (p, balanced title', balanced abst)) $
                                     filter (\(_,(title,_,_,_,_,_,abst)) -> not $ null (balanced title ++ balanced abst)) finalL
             unless (null balancedBrackets) $ do printRed "GTX: Link Annotation Error: unbalanced brackets!"
                                                 printGreen $ ppShow balancedBrackets

             -- check validity of all external links:
             let urlsAll = filter (\(x,_) -> if x == "" then error "LinkMetadata.urlsAll: empty URL!" else
                                               let u = head x in
                                               if u `elem` ['/', '!'] || isInflationURL (T.pack [u]) ||
                                                      anyInfix x C.uriValidationExemptInfixes || not (anyPrefix x C.ignoredMalformedURLPrefixes) then False
                                                 else not (isURIReference x)) finalL
             unless (null urlsAll) $ printRed "Invalid URIs?" >> printGreen (ppShow urlsAll)

             -- look for duplicates due to missing affiliation:
             let urlsDuplicateAffiliation = findDuplicatesURLsByAffiliation final
             unless (null urlsDuplicateAffiliation) $ printRed "Duplicated URLs by affiliation:" >> printGreen (show urlsDuplicateAffiliation)

             -- TODO: there are way too many duplicate titles to deal with right now:
             -- let titlesSimilar = sort $ map (\(u,(t,_,_,_,_,_,_)) -> (u, map toLower t)) $ filter (\(u,_) -> '.' `elem` u && not ("wikipedia.org" `isInfixOf` u)) $ M.toList final
             -- let titles = filter (\title -> length title > 10) $ map snd titlesSimilar
             -- unless (length (nubOrd titles) == length titles) $ printRed  "Duplicate titles in GTXs!: " >> printGreen (show (sort (titles \\ nubOrd titles)))

             let authors = map (\(_,(_,aut,_,_,_,_,_)) -> aut) finalL
             mapM_ (\a -> unless (null a) $ when ((isDate a || isNumber (head a) || isPunctuation (head a)) && not (M.member (T.pack a) CA.authorLinkDB || a `elem` CA.authorWhitelist))
                                                  (printRed "Mixed up author & date?: " >> printGreen a) ) authors
             let authorsBadChars = nubOrd $ filter (\a -> a `notElem` CA.authorWhitelist &&
                                                 (anyInfix a C.badAuthorSubstrings || (last a /= '.' && isPunctuation (last a)))) $ filter (not . null) authors
             unless (null authorsBadChars) (printRed "Mangled author list?" >> printGreen (ppShow authorsBadChars))

             let yearLimit = show (CM.currentYear + C.futureYearSlack) -- no entry should be published or created 2+ years in the future!
             let datesBad = filter (\(_,(_,_,dt,dc,_,_,_)) -> (not (isDate dt || null dt)) || (not (isDate dc || null dc)) ||
                                                              (dt /= "" && (let y = take 4 dt
                                                                                y' = take 4 dc
                                                                            in y > yearLimit || y' > yearLimit))
                                   ) finalL
             unless (null datesBad) (printRed "Malformed date (not 'YYYY[-MM[-DD]]'): " >> printGreen (show datesBad))

             -- 'filterMeta' may delete some titles which are good; if any annotation has a long abstract, all data sources *should* have provided a valid title. Enforce that.
             let titlesEmpty = M.filter (\(t,_,_,_,_,_,abst) -> t=="" && length abst > C.missingTitleAbstractMinLength) final
             unless (null titlesEmpty) $ error ("Link Annotation Error: missing title despite abstract!" ++ show titlesEmpty)

             let tagIsNarrowerThanFilename = M.map (\(title,_,_,_,_,tags,_) -> (title,tags)) $ M.filterWithKey (\f (_,_,_,_,_,tags,_) -> if not ("/doc/" `isPrefixOf` f) then False else
                                                        let fileTag = delete "/doc/" $ takeDirectory f
                                                         in any ((fileTag++"/") `isPrefixOf`) tags) final
             unless (null tagIsNarrowerThanFilename) (printRed "Files whose tags are more specific than their path: " >> printGreen (unlines $ map (\(f',(t',tag')) -> t' ++ " : " ++ f' ++ " " ++ unwords tag') $ M.toList tagIsNarrowerThanFilename))

             -- check tags (not just full but all of them, including half.gtx)
             let tagsSet = sort $ nubOrd $ concat $ M.elems $ M.map (\(_,_,_,_,_,tags,_) -> tags) $ M.filter (\(t,_,_,_,_,_,_) -> t /= "") final
             tagsAll <- listTagsAll
             let tagsBad = tagsSet \\ tagsAll
             let annotationsWithBadTags = M.filter (\(_,_,_,_,_,ts,_) -> hasAny ts tagsBad) final
             unless (null annotationsWithBadTags) $ error ("Link Annotation Error: tag does not match a directory! Bad annotations: " ++ show annotationsWithBadTags)

             -- these are good ideas but will have to wait for embedding-based refactoring to be usable warnings.
             -- let tagsOverused = filter (\(c,_) -> c > tagMax) $ tagCount final
             -- unless (null tagsOverused) $ printRed "Overused tags: " >> printGreen (show tagsOverused)

             -- let tagPairsOverused = filter (\(c,_) -> c > tagPairMax) $ tagPairsCount final
             -- unless (null tagPairsOverused) $ printRed "Overused pairs of tags: " >> printGreen (show tagPairsOverused)

             -- 'See Also' links in annotations get put in multi-columns due to their typical length, but if I cut them down to 1–2 items, the default columns will look bad. `preprocessMarkdown.hs` can't do a length check because it has no idea how I will edit the list of similar-links down, so I can't remove the .columns class *there*; only way to do it is check finished annotations for having .columns set but also too few similar-links:
             let badSeeAlsoColumnsUse = M.keys $ M.filterWithKey (\_ (_,_,_,_,_,_,abst) -> let count = length (Data.List.HT.search "data-embeddingdistance" abst) in (count == 1 || count == 2) && "<div class=\"columns\">" `isInfixOf` abst ) final
             unless (null badSeeAlsoColumnsUse) (printRed "Remove columns from skimpy See-Also annotations: " >> printGreen (show badSeeAlsoColumnsUse))

             let manualIDs = M.keys $ M.filterWithKey (\_ (_,_,_,_,misc,_,_) -> let i = kvLookup "id" misc in i/="" && head i == '_') final
             unless (null manualIDs) (printRed $ "Manual IDs start with an underscore, which is forbidden. Only hash-IDs are allowed to start with an underscore! Bad entries: " ++ show manualIDs)

             -- ensure that link IDs are unique, and report ambiguous ones for fixing:
             let disambigs = LinkID.getDisambiguatedPairs final
             unless (null disambigs) (printRed "Link ID overrides: " >> print disambigs)

             return final

-- Abort if two or more entries of the metadata list carry the same non-empty abstract.
--
-- Empty abstracts are deliberately ignored: they are not the target of *this* check.
-- On finding duplicates, this calls 'error' with a message listing (sorted, each path once)
-- every path whose abstract is shared with at least one other entry; otherwise it does nothing.
duplicateAbstracts :: MetadataList -> IO ()
duplicateAbstracts mdl = do
  -- count occurrences of each non-empty abstract string:
  let abstracts      = filter (not . null) $ map (\(_,(_,_,_,_,_,_,s)) -> s) mdl
      abstractCounts = M.fromListWith (+) (map (\s -> (s, 1)) abstracts) :: M.Map String Int
      -- the abstracts which appear more than once:
      duplicated     = M.keysSet $ M.filter (> 1) abstractCounts
  unless (Set.null duplicated) $ do
    -- every path using one of the duplicated abstracts; de-duplicated *and* sorted, so the report is stable & easy to scan:
    let offendingPaths = sort $ nubOrd [ path | (path,(_,_,_,_,_,_,s)) <- mdl, s `Set.member` duplicated ]
        errorMsg = "Duplicate non-empty annotations found. The following " ++ show (length offendingPaths) ++ " paths share annotation content with at least one other path:\n" ++
                   unlines (map ("  - " ++) offendingPaths)
    error errorMsg

-- Write out the annotation fragment of every entry in the metadata database, in two passes.
writeAnnotationFragments :: ArchiveMetadata -> Metadata -> SizeDB -> Bool -> IO ()
writeAnnotationFragments am md sizes writeOnlyMissing = do
  let entries  = M.toList md
      writeOne = uncurry (writeAnnotationFragment am md sizes writeOnlyMissing)
      isShort (_,(_,_,_,_,_,_,abst)) = length abst <= CM.minimumAnnotationLength
  -- first pass: process all possible partials (entries with short/empty abstracts), so they are written out & on-disk for the `getAnnotationLinkCheck` in `addHasAnnotation`
  mapM_ writeOne (filter isShort entries)
  -- second pass: process all possible annotations. (This is awkward but without building in a whole dependency system or a global database or keeping the per-annotation processing, it's hard to see how to ensure no race condition with the annotation checking.)
  mapM_ writeOne entries
-- Render a single metadata entry to its on-disk HTML annotation fragment.
--
-- Arguments: archive metadata, the full metadata database, the file-size database, an 'only write
-- fragments which do not exist yet' flag, the entry's URL/path, and the entry itself.
-- Nothing is written when: the entry is wholly blank and has no size-DB entry; the URL is a
-- tag-index section anchor; the flag is set and the fragment file already exists; or the entry has
-- too little metadata to be worth a 'partial' annotation. Calls 'error' if Pandoc fails to render the HTML.
writeAnnotationFragment :: ArchiveMetadata -> Metadata -> SizeDB -> Bool -> Path -> MetadataItem -> IO ()
-- a blank entry (eg. a cached failure) which also has no known file size has nothing to show at all:
writeAnnotationFragment _ _ sdb _ u ("","","",_,[],[],"")
  | not (M.member (linkCanonicalize u) sdb) = return ()
writeAnnotationFragment am md sdb onlyMissing u i@(a,b,c,dc,kvs,ts,abst) =
      if (("/index#" `isInfixOf` u && "/index#abstract" /= u) && ("#section" `isInfixOf` u || "-section" `isSuffixOf` u)) ||
         anyInfix u ["/index#see-also", "/index#links", "/index#miscellaneous"] then return ()
      else do let u' = linkCanonicalize u
              let (filepath',_) = getAnnotationLink u'
              annotationExisted <- doesFileExist filepath'
              -- write if we are rewriting everything, or if this fragment is missing:
              when (not onlyMissing || not annotationExisted) $ do

                  (_,bl) <- getBackLinkCheck u'
                  (_,sl) <- getSimilarLinkCheck u'
                  (_,lb) <- getLinkBibLinkCheck u'
                  -- we prefer annotations which have a fully-written abstract, but we will settle for 'partial' annotations,
                  -- which serve as a sort of souped-up tooltip: partials don't get the dotted-underline indicating a full annotation, but it will still pop-up on hover.
                  -- Now, tooltips already handle title/author/date, so we only need partials in the case of things with tags, abstracts, backlinks, or similar-links, which cannot be handled by tooltips (since HTML tooltips only let you pop up some raw unstyled Unicode text, not clickable links).

                  -- if we do not have a 'full' abstract, we have a miscellaneous set of metadata, none of which are all *that* important on their own, but which together can be worth showing to the reader as a 'partial' annotation.
                  -- How do we decide how much miscellaneous metadata is enough? it is currently rather ad hoc. Currently, we treat each one as a kind of binary threshold, and if any are True, the partial status is true
                  blN    <- getBackLinkCount u'
                  slN    <- getSimilarLinkCount u'
                  let filesize = filesizeSpan sdb u'
                  let partialScoring = 0 < sum [length (drop C.partialAnnotationIgnoredTagCount ts),
                                                 length abst,
                                                 if null filesize then 0 else 1,
                                                 if blN > C.partialAnnotationBacklinkThreshold then 1 else 0,
                                                 if slN > C.partialAnnotationSimilarThreshold then 1 else 0]

                  when partialScoring $ do
                      let titleHtml    = nominalToRealInflationAdjusterHTML c $ typesetHtmlField $ titlecase' a
                      let authorHtml   = typesetHtmlField b
                      -- obviously no point in trying to reformat date/DOI, so skip those
                      let abstractHtml = typesetHtmlField abst
                      -- TODO: this is fairly redundant with 'pandocTransform' in hakyll.hs; but how to fix without circular dependencies…
                      let pandoc = Pandoc nullMeta $ generateAnnotationBlock md am sdb (u', Just (titleHtml,authorHtml,c,dc,kvs,ts,abstractHtml)) bl sl lb
                      -- links inside a real abstract may themselves need annotating:
                      unless (null abst) $ void $ createAnnotations md pandoc
                      let p = walk (hasAnnotation md) $
                                walk (linkIcon . linkLive . nominalToRealInflationAdjuster) $
                                  convertInterwikiLinks $
                                  walk addPageLinkWalk $
                                  parseRawAllClean pandoc
                      pandoc' <- walkM (imageLinkHeightWidthSet <=< addCanPrefetch <=< localizeLink am) p
                      let finalHTMLEither = runPure $ writeHtml5String safeHtmlWriterOptions pandoc'

                      when (length (urlEncode u') > C.annotationURLWarningLength) (printRed "Warning, annotation fragment path → URL truncated!" >>
                                                          putStrLn ("Was: " ++ urlEncode u' ++ " but truncated to: " ++ take C.annotationURLPreviewLength u' ++ "; (check that the truncated file name is still unique, otherwise some popups will be wrong)"))

                      case finalHTMLEither of
                        Left er -> error ("Writing annotation fragment failed! " ++ show u ++ " : " ++ show i ++ " : " ++ show er)
                        Right finalHTML -> do
                          -- try to add image height=/width= attributes to `<img>` elements for faster rendering for annotations
                          finalHTML' <- fmap T.pack $ addImgDimensions $ T.unpack finalHTML
                          writeUpdatedFile "annotation" filepath' finalHTML'
                          -- HACK: the current hakyll.hs assumes that all annotations already exist before compilation begins, although we actually dynamically write as we go.
                          -- This leads to an annoying behavior where a new annotation will not get synced in its first build, because Hakyll doesn't "know" about it and won't copy it into the _site/ compiled version, and it won't get rsynced up. This causes unnecessary errors.
                          -- There is presumably some way for Hakyll to do the metadata file listing *after* compilation is finished, but it's easier to hack around here by forcing 'new' annotation writes to be manually inserted into _site/.
                          -- NOTE: write the *same* (image-dimension-augmented) HTML as the primary copy, so the two files do not differ.
                          unless annotationExisted $ writeUpdatedFile "annotation" ("./_site/"++filepath') finalHTML'

-- Extract every link from a document's body and run 'annotateLink' over each one (via `Par.mapM_`), creating annotations as necessary for new links.
-- The document's own metadata header is dropped before the links are extracted.
createAnnotations :: Metadata -> Pandoc -> IO ()
createAnnotations md (Pandoc _ blocks) =
  let links = extractLinksInlines (Pandoc nullMeta blocks)
  in Par.mapM_ (annotateLink md) links

-- Look up, or attempt to create, the annotation for a single Link.
--
-- Returns `Right (path, item)` if the link already has a database entry or a new one could be created
-- (new entries are appended to the metadata database), `Left Permanent` for URLs which can never be
-- annotated or whose lookup failed for good (the failure is cached as a blank entry), and
-- `Left Temporary` for failures which should be retried on a later run.
-- Calls 'error' on malformed input: a non-Link Inline, an empty URL, a URL ending in a space, or a local link to a directory.
annotateLink :: Metadata -> Inline -> IO (Either Failure (Path, MetadataItem))
annotateLink md x@(Link (_,_,_) _ (targetT,_))
  | let targetT' = T.replace "https://gwern.net/" "/" targetT in
      anyPrefixT targetT' ["/metadata/", "/doc/www/", "/ref/", "/blog/", "#", "!", "\8383", "$"] = return (Left Permanent) -- annotation intermediate files, self-links, interwiki links, and inflation-adjusted currencies *never* have annotations. And "/blog/" links are always generated *from* annotations and thus don't need to be checked.
  | otherwise =
  do let target = T.unpack targetT
     when (null target) $ error (show x)
     when (any (`isSuffixOf` target) ["%20", " "]) $ error $ "URL ends in space? " ++ target ++ " (" ++ show x ++ ")"
     -- normalize: convert 'https://gwern.net/doc/foo.pdf' to '/doc/foo.pdf' and './doc/foo.pdf' to '/doc/foo.pdf'
     -- the leading '/' indicates this is a local Gwern.net file
     let target' = replace "https://gwern.net/" "/" target
     let target'' = case target' of
                      ('.':rest) -> rest
                      _          -> target'
     -- a bare "." would normalize to the empty string; fail with a clear message rather than an opaque empty-list exception further down:
     when (null target'') $ error ("LM.annotateLink: URL is empty after normalization: " ++ target ++ " (" ++ show x ++ ")")

     -- check local link validity: every local link except tags should exist on-disk:
     when ("/" `isPrefixOf` target'' && not ("/metadata/annotation/" `isPrefixOf` target'')) $
       do let localPath = drop 1 target'' -- path relative to the site root (leading '/' removed)
          isDirectory <- doesDirectoryExist localPath
          when isDirectory $ error ("Attempted to annotate a directory, which is not allowed (links must be to files or $DIRECTORY/index): " ++ target' ++ " : " ++ target ++ " (" ++ show x ++ ")")
          -- strip any anchor, and treat extension-less paths as Markdown pages:
          let target''' = (\f -> if '.' `notElem` f then f ++ ".md" else f) $ takeWhile (/='#') localPath

          unless (takeFileName target''' == "index" || takeFileName target''' == "index.md") $
             do exist <- doesFileExist target'''
                unless exist $ printRed ("Link error in 'LM.annotateLink': file does not exist? " ++ target''' ++ " (" ++target++")" ++ " (" ++ show x ++ ")")

     let annotated = M.lookup target'' md
     today <- CM.todayDayString
     case annotated of
       -- the link has a valid annotation already defined, so we're done: nothing changed.
       Just i  -> return (Right (target'', i))
       Nothing -> do new <- linkDispatcher md x
                     case new of
                       -- some failures we don't want to cache because they may succeed when checked differently or later on or should be fixed:
                       Left Temporary -> return (Left Temporary)
                       -- cache the failures too, so we don't waste time rechecking the PDFs every build; return False because we didn't come up with any new useful annotations:
                       Left Permanent -> appendLinkMetadata target'' ("", "", "", today, [], [], "") >> return (Left Permanent)
                       Right y@(f,m) -> do
                                       printGreen (f ++ "; GTX:\n" ++ T.unpack (T.unlines (GTX.untupleize today y)) ++ "\nHaskell: " ++ show y)
                                       -- return true because we *did* change the database & need to rebuild:
                                       appendLinkMetadata f m >> return (Right y)
annotateLink _ x = error ("LM.annotateLink was passed an Inline which was not a Link: " ++ show x)

-- Block-level wrapper: walk a Block and apply 'hasAnnotationOrIDInline' to every Inline in it, marking each URL as having an annotation available (or not) and adding its link ID.
-- WARNING: all pseudo-URLs like interwikis ('!W') or inflation-adjustments ('$2025') must already have been converted to proper URLs by this point. Assigning link IDs based on a link like '!W' is useless and collides all WP links.
hasAnnotation :: Metadata -> Block -> Block
hasAnnotation = walk . hasAnnotationOrIDInline

-- For a Link or Image: look its canonicalized URL up in the metadata database and decorate it with a link ID and, if it has a non-blank entry, with annotation & recently-changed markers.
-- Any other Inline, anything already carrying one of `C.annotationClasses`, and any "/blog/" URL is returned unchanged.
hasAnnotationOrIDInline :: Metadata -> Inline -> Inline
hasAnnotationOrIDInline md inline = case inline of
    Link  (_, classes, _) _ (url, _) -> annotateUnlessMarked classes url
    Image (_, classes, _) _ (url, _) -> annotateUnlessMarked classes url
    _                                -> inline
  where
    -- elements which already carry an annotation class are left exactly as they are:
    annotateUnlessMarked :: [T.Text] -> T.Text -> Inline
    annotateUnlessMarked classes url
      | hasAny C.annotationClasses classes = inline
      | otherwise                          = annotateCanonical (linkCanonicalize (T.unpack url))

    -- NOTE: we do not implement any blacklists or exclusion here, but defer it to the Metadata database, which will or will not have a Nothing vs Just entry; so all that logic is handled by `linkDispatcherURL` creating them in the first place.
    annotateCanonical :: String -> Inline
    annotateCanonical canonicalUrl
      | "/blog/" `isPrefixOf` canonicalUrl = inline
      | otherwise = case M.lookup canonicalUrl md of
          Just item | not (isBlank item) -> addID md (Just item) (addRecentlyChanged item (addHasAnnotation item inline))
          _                              -> addID md Nothing inline

    -- a wholly-empty entry is treated exactly like a missing one:
    isBlank :: MetadataItem -> Bool
    isBlank ("","","","",[],[],"") = True
    isBlank _                      = False
-- Give a Link or Image a generated ID, unless it already has one or opts out with the `id-not` class.
-- The ID comes from `generateID`, fed the URL plus the author & date of the metadata entry (empty strings if there is none).
-- Calling this on any other kind of Inline is a programming error and aborts with 'error'.
addID :: Metadata -> Maybe MetadataItem -> Inline -> Inline
addID md maybeMetadataItem inline = case inline of
    Link  attr contents target -> rebuild Link  attr contents target
    Image attr contents target -> rebuild Image attr contents target
    _ -> error $
            "LinkMetadata.hs: addID: called with " ++
            show maybeMetadataItem ++
            " annotation and a non-Link Inline element:" ++
            show inline ++
            "; This should never happen."
 where
        -- reconstruct the element with a fresh ID, but only if it has no ID yet (never override an existing one) and does not opt out:
        rebuild :: (Attr -> [Inline] -> Target -> Inline) -> Attr -> [Inline] -> Target -> Inline
        rebuild mk (anchor, classes, kvs) contents target@(url, _)
          | anchor == "" && "id-not" `notElem` classes = mk (generateID md (T.unpack url) author date, classes, kvs) contents target
          | otherwise                                  = inline

        -- author & date used to derive the ID:
        (author, date) = case maybeMetadataItem of
            Nothing                  -> ("", "")
            Just (_, au, dt, _, _, _, _) -> (au, dt)

-- The prefetch JS defaults to assuming that all URLs can be prefetched; however, for local files which are large, or which cannot be viewed in-browser (so there is little point in prefetching them), we explicitly disable prefetching by adding a '.prefetch-not' class.
-- Left untouched: non-Links, links already marked '.prefetch'/'.prefetch-not', non-local links, and anything under "/metadata/".
addCanPrefetch :: Inline -> IO Inline
addCanPrefetch x@(Link (_,classes,_) _ (f,_))
 | explicitlyMarked || not (isLocal f) || "/metadata/" `T.isPrefixOf` f = return x
 | not (isFileViewable path) = return optedOut
 | otherwise = do bytes <- getFileSize (takeWhile (/='#') (tail path)) -- on-disk path: drop the leading character & any anchor
                  -- files at or over the configured limit are too much to spend speculatively (the original note puts the limit at >10MB):
                  return (if bytes < C.maxPrefetchBytes then x else optedOut)
  where path             = T.unpack f
        explicitlyMarked = any (`elem` classes) ["prefetch", "prefetch-not"]
        optedOut         = addClass "prefetch-not" x
addCanPrefetch x = return x

-- addHasAnnotation :: MetadataItem -> Inline -> Inline
-- addHasAnnotation (title,aut,dt,_,miscMetadata,_,abstrct) x@(Link (a,b,c) e (f,g))
--   | wasAnnotated x = x'
--   -- WARNING: Twitter is currently handled in Config.LinkArchive, because whether a Twitter/Nitter URL is a valid 'annotation' depends on whether there is a Nitter snapshot hosted locally the JS can query. Many Nitter snapshots, sadly, fail, so it is *not* guaranteed that a Twitter URL will have a usable snapshot. TODO: when Twitter is merged into the backend, parsing the Nitter mirrors to create proper annotations, rather than using JS to parse them at runtime, this should be removed.
--   | length abstrct > CM.minimumAnnotationLength  = addClassPopupNot miscMetadata $ addClass "link-annotated" x' -- full annotation, no problem.
--    -- may be a partial…?
--   | not $ unsafePerformIO $ doesFileExist $ fst $ getAnnotationLink $ T.unpack f = x'
--   -- | unsafePerformIO $ do
--   --                         (filepath',_) <- getAnnotationLinkCheck $ T.unpack f
--   --                         return $ filepath' == ""
--   --     = x' -- no, a viable partial would have a (short) fragment written out, see `writeAnnotationFragment` logic; WARNING: race condition here - what if we process a full annotation, which links to a partial (eg. its author) *before* the partial has been written out? we will get a spurious 'no full or partial annotation' return... The current compromise is to try to process all URLs with short/empty annotations first (which might be partials) and then when the fragments should all be written out, rerun with the regular batch
--   | otherwise = addClassPopupNot miscMetadata $ addClass "link-annotated-partial" x'
--   where
--     g'
--       | g/="" = g
--       | title=="" && aut=="" = g
--       | title/="" && aut=="" = T.pack title
--       | title=="" && aut/="" = T.pack $ authorsToCite (T.unpack f) aut dt
--       | otherwise = T.pack $ "'" ++ title ++ "', " ++ authorsToCite (T.unpack f) aut dt
--     x' = Link (a,b,c) e (f,g')
-- addHasAnnotation _ z = z

-- Mark a Link or Image as (partially) annotated, dispatching to 'addHasAnnotationLinkLike' with the matching class prefix & constructor; any other Inline is returned unchanged.
addHasAnnotation :: MetadataItem -> Inline -> Inline
addHasAnnotation meta inline = case inline of
  Link  attr body target -> addHasAnnotationLinkLike meta "link"  Link  attr body target inline
  Image attr body target -> addHasAnnotationLinkLike meta "image" Image attr body target inline
  _                      -> inline
-- Shared implementation for Links & Images: fill in a tooltip title from the metadata (if the element has none), and add the '$PREFIX-annotated' or '$PREFIX-annotated-partial' class as appropriate.
--
-- Decision order: an element which is already marked annotated only gets the tooltip; an abstract longer than `CM.minimumAnnotationLength` makes it a full annotation; otherwise it is a partial annotation only if an annotation fragment file for it exists on disk (checked via `unsafePerformIO`).
addHasAnnotationLinkLike :: MetadataItem
                         -> T.Text -- class prefix ("link" or "image")
                         -> (Attr -> [Inline] -> Target -> Inline)  -- constructor used to rebuild the element
                         -> Attr -> [Inline] -> Target
                         -> Inline  -- the original element, for the `wasAnnotated` check
                         -> Inline
addHasAnnotationLinkLike (title,aut,dt,_,miscMetadata,_,abstrct) prefix mkInline (a,b,c) e (f,g) x
  | wasAnnotated x                                                               = retitled
  | length abstrct > CM.minimumAnnotationLength                                  = tagged "-annotated"
  | not $ unsafePerformIO $ doesFileExist $ fst $ getAnnotationLink $ T.unpack f = retitled
  | otherwise                                                                    = tagged "-annotated-partial"
  where
    -- add the given annotation class (and '.extract-not', if the metadata requests it) to the retitled element:
    tagged suffix = addClassPopupNot miscMetadata $ addClass (prefix `T.append` suffix) retitled
    retitled      = mkInline (a,b,c) e (f, tooltip)
    citation      = authorsToCite (T.unpack f) aut dt
    -- an existing tooltip always wins; otherwise build one from whichever of title/author are available:
    tooltip
      | g /= ""   = g
      | otherwise = case (null title, null aut) of
          (True,  True)  -> g
          (False, True)  -> T.pack title
          (True,  False) -> T.pack citation
          (False, False) -> T.pack $ "'" ++ title ++ "', " ++ citation

-- Sometimes we want to disable the popup on a link where the popup view would be bad for whatever reason (eg. spoilers hidden by reader-mode on the main page). The class which disables a popup is '.extract-not'.
-- So we check whether the entry's key-value metadata requests it: specifically, whether its 'css-extension' value mentions "extract-not"; if so, the class is added to the element.
-- A dictionary consisting of a single empty key-value pair is treated as invalid input and aborts.
addClassPopupNot :: [(String,String)] -> Inline -> Inline
addClassPopupNot miscMetadata x = case miscMetadata of
  []        -> x
  [("","")] -> error $ "LM.addClassPopupNot: invalid metadata key-value dict passed in Only empty strings? Other input was: " ++ show x
  _ | requestsNoPopup -> addClass "extract-not" x
    | otherwise       -> x
  where
    requestsNoPopup = maybe False ("extract-not" `isInfixOf`) (lookup "css-extension" miscMetadata)

-- Checks if a Link/Image was recently modified & sets a '.link-modified-recently' (or '.image-modified-recently') class for CSS styling; the usual negation class ('.link-modified-recently-not'/'.image-modified-recently-not') on the element suppresses this.
-- 'Recently' means the entry's 4th field (its modification date) is not older than `CM.currentMonthAgo`; entries with no such date are never marked.
-- Exclusions: indexes/tag-directories, because they churn far too frequently (and contain intrinsically dated contents) to be worth highlighting to readers; Wikipedia links (too numerous).
-- Any Inline other than a Link or Image is returned unchanged.
addRecentlyChanged :: MetadataItem -> Inline -> Inline
addRecentlyChanged (_,_,_,"",       _,_,_) x = x
addRecentlyChanged (_,_,_,dtChanged,_,_,_) x@(Link _ _ (url,_))
  | dtChanged < CM.currentMonthAgo           = x
  | hasClass "link-modified-recently-not" x  = x
  | "/index" `T.isInfixOf` url               = x
  -- NOTE: the Wikipedia test must be on the link's *URL*: the first field of a MetadataItem is the title, which never contains a URL.
  | "wikipedia.org/wiki/" `T.isInfixOf` url  = x
  | otherwise                                = addClass "link-modified-recently" x
addRecentlyChanged (_,_,_,dtChanged,_,_,_) x@(Image _ _ (url,_))
  | dtChanged < CM.currentMonthAgo           = x
  | hasClass "image-modified-recently-not" x = x
  | "/index" `T.isInfixOf` url               = x
  | otherwise                                = addClass "image-modified-recently" x
addRecentlyChanged _ x = x

-- Was this Link/Image given either a partial or a full annotation? Asking this of any other kind of Inline is a programming error and aborts.
wasAnnotated :: Inline -> Bool
wasAnnotated x = case x of
  Link{}  -> isAnnotatedInline x
  Image{} -> isAnnotatedInline x
  _       -> error $ "LinkMetadata.wasAnnotated: tried to get annotation status of a non-Link/Image element, which makes no sense? " ++ show x
-- True if the element carries at least one of the classes listed in `C.positiveAnnotationClasses`.
isAnnotatedInline :: Inline -> Bool
isAnnotatedInline x = or [ hasClass cls x | cls <- C.positiveAnnotationClasses ]

-- Build the Pandoc blocks making up one annotation: a title link, then author, date, tags, file size, and auxiliary links (backlinks / similar-links / link-bibliography), each in its own paragraph, followed by the abstract (if any) as a blockquote, followed by whatever `generateFileTransclusionBlock` yields for the URL.
--
-- Arguments: the metadata database, archive metadata, file-size database, the (URL, maybe-entry) pair, and three paths `blp`/`slp`/`lb` used as the targets of the backlinks, similar-links, and link-bibliography links respectively; an empty string for any of them means that link is omitted.
-- With no entry (`Nothing`), only a bare link to the URL is generated (plus file size & file transclusion).
-- Calls 'error' (when the title link's ID is forced) if `generateID` returns an empty ID.
generateAnnotationBlock :: Metadata -> ArchiveMetadata -> SizeDB -> (FilePath, Maybe MetadataItem) -> FilePath -> FilePath -> FilePath -> [Block]
generateAnnotationBlock md am sdb (f, ann) blp slp lb =
  case ann of
     Nothing                 -> nonAnnotatedLink
     -- Just ("",   _,_,_,_,_,_)  -> nonAnnotatedLink
     -- Just (_,    _,_,_,_,_,"") -> nonAnnotatedLink
     Just x@(tle,aut,dt,_,kvs,ts,abst) ->
       -- an entry without a title is displayed as its raw URL in <code>:
       let tle' = if null tle then "<code>"++f++"</code>" else Typography.titleWrap tle
           lid  = generateID md f aut dt
           lid' = if lid/="" then T.pack "annotation-" `T.append` lid else
                     error ("LinkMetadata.generateAnnotationBlock: `generateID` failed to yield a non-empty ID on a link. Input was: " ++ show x)
           -- NOTE: we cannot link to an anchor fragment in ourselves, like just link in the annotation header to `#backlink-transclusion`, because it would severely complicate all the anchor-rewriting logic (how would it know if `#backlink-transclusion` refers to something *in* the annotation, or is a section or anchor inside the annotated URL?). But fortunately, by the logic of caching, it doesn't much matter if we link the same URL twice and pop it up the first time vs transclude it inside the popup/popover the second time.
           lidBacklinkFragmentID    = if lid=="" then "" else "backlink-transclusion-"    `T.append` lid
           lidSimilarLinkFragmentID = if lid=="" then "" else "similarlink-transclusion-" `T.append` lid
           lidLinkBibLinkFragmentID = if lid=="" then "" else "link-bibliography-transclusion-" `T.append` lid
           author = authorCollapse aut

           -- the date is displayed as just its first 4 characters (presumably the year), with the full date as a tooltip title when it is longer than that:
           dt' = dateTruncateBad dt
           date = if dt'=="" || not (isDate dt) then [] else [Span ("", ["date", "cite-date"],
                                                 if length dt' > 4 then [("title",T.pack dt')] else []) -- don't set a redundant title
                                           [Str (T.pack $ take 4 dt')]]
           tags = if ts==[] then [] else [tagsToLinksSpan $ map T.pack ts]
           -- each auxiliary link is prefixed with a "; " separator depending on which other items are present:
           backlink = if blp=="" then [] else (if tags==[] then [] else [Str ";", Space]) ++  [Span ("", ["backlinks"], []) [Link ("",["aux-links", "link-page", "id-not", "backlinks"],[]) [Str "backlinks"] (T.pack blp, "Reverse citations for this page.")]]
           -- NOTE(review): this separator condition tests `lb` (the link-bibliography path) rather than `slp`; confirm that is intended.
           similarlinkPadding = if (blp=="" && tags==[]) || lb=="" then [] else [Str ";", Space]
           similarlink =  if slp=="" then [] else
                            similarlinkPadding++[Span ("", ["similars"], []) ([Link ("",["aux-links", "link-page", "id-not", "similars"],[]) [Str "similar"] (T.pack slp, "Similar links for this link (by text embedding).")])]
           linkBibliography = if lb=="" then [] else (if blp=="" && slp=="" && tags==[] then []
                                                       else [Str ";", Space]) ++ [Span ("", ["link-bibliography"], [])
                                                                                   [Link ("",["aux-links", "link-page", "id-not", "link-bibliography"],[]) [Str "bibliography"] (T.pack lb, "Link-bibliography for this annotation (list of references/sources/links it cites).")]]
           -- a DOI, if the entry has one, is attached to the title link as a 'doi' attribute:
           doi = kvDOI kvs
           values = if doi=="" then [] else [("doi",T.pack $ processDOI doi)]
           -- the title link itself: marked as a full annotation if there is an abstract, as a partial one otherwise.
           link = addRecentlyChanged x $ linkLive $ unsafePerformIO $ localizeLink am $ -- HACK: force archiving & link-living because it is not firing reliably (particularly on Twitter partials); another Raw HTML issue? it's suspicious that we have that RawInline right there… which might disable walks?
             -- NOTE: 'data-id-ref' is no longer used to provide '/ref/$ID' URLs in popup title-bars due to confusion and bad mobile UX, and currently has no function. I do still want to expose the /ref/ functionality to the reader *somehow*, just don't yet know how...
             Link (lid', [if null abst then "link-annotated-partial" else "link-annotated"], ("id-ref",lid):values) [RawInline (Format "html") (T.pack tle')] (T.pack f,"")
           -- make sure every abstract is wrapped in paragraph tags for proper rendering:
           abst' = if null abst || anyPrefix abst ["<p>", "<ul", "<ol", "<h2", "<h3", "<bl", "<figure", "<div"] then abst else "<p>" ++ abst ++ "</p>"
       in
         -- one paragraph per metadata item, in display order:
         map Para
          ([[link]
         , author
         , date
         , tags] ++
         filesizeLines ++
          [backlink
         , similarlink
         , linkBibliography]) ++
         -- the abstract (with its anchors rewritten), to which a collapsed transclusion <div> is appended for each auxiliary link which exists:
         (if null abst then []
                  else [BlockQuote [RawBlock (Format "html") (rewriteAnchors f (T.pack abst') `T.append`
                                                   if (blp++slp++lb)=="" then ""
                                                   else
                                                        ((if blp=="" then "" else ("<div class=\"backlinks-append aux-links-append collapse\"" `T.append` " id=\"" `T.append` lidBacklinkFragmentID `T.append` "\" " `T.append` ">\n<p><a class=\"id-not include-even-when-collapsed\" href=\"" `T.append` T.pack blp `T.append` "\"><strong>Backlinks</strong>:</a></p>\n</div>")) `T.append`
                                                         (if slp=="" then "" else ("<div class=\"similars-append aux-links-append collapse\"" `T.append` " id=\"" `T.append` lidSimilarLinkFragmentID `T.append` "\" " `T.append` ">\n<p><a class=\"id-not include-even-when-collapsed\" href=\"" `T.append` T.pack slp `T.append` "\"><strong>Similar Links:</strong></a></p>\n</div>")) `T.append`
                                                          (if lb=="" then "" else ("<div class=\"link-bibliography-append aux-links-append collapse\"" `T.append` " id=\"" `T.append` lidLinkBibLinkFragmentID `T.append` "\" " `T.append` ">\n<p><a class=\"id-not include-even-when-collapsed\" href=\"" `T.append` T.pack lb `T.append` "\"><strong>Bibliography:</strong></a></p>\n</div>")))
                                                              )]
                       ]) ++
                generateFileTransclusionBlock am (f, x)
    where
      -- file-size display for the URL (empty if `filesizeSpan` yields nothing for it):
      filesize = filesizeSpan sdb f
      filesizeLines = if null filesize then [] else [filesize]

      -- fallback for URLs with no metadata entry at all: a plain link whose text is the URL itself.
      -- NOTE(review): every field but the first of the dummy entry passed to `generateFileTransclusionBlock` is `undefined`, so this presumably relies on that function not inspecting them; confirm.
      nonAnnotatedLink :: [Block]
      nonAnnotatedLink = [Para [Link nullAttr [Str (T.pack f)] (T.pack f, "")]] ++
                         map Para filesizeLines ++
                         generateFileTransclusionBlock am (f, ("",undefined,undefined,undefined,undefined,undefined,undefined))
-- Generate an 'annotation block', except that we leave the actual heavy-lifting of 'generating the annotation' to transclude.js, which will pull in the popup's annotation dynamically/lazily at runtime. As such, this is a simplified version of `generateAnnotationBlock`.
-- The result is a bolded link to the URL (titled with the entry's title, or the raw URL in <code> if it has none), followed, for links which did not end up marked as annotated, by a direct transclusion of the file itself.
generateAnnotationTransclusionBlock :: ArchiveMetadata -> (FilePath, MetadataItem) -> [Block]
generateAnnotationTransclusionBlock am (f, x@(tle,_,_,_,_,_,_)) = Para [Strong (link : linkColon)] : fileTransclude
  where
    displayTitle
      | null tle  = "<code>"++f++"</code>"
      | otherwise = tle
    -- NOTE: we set `.include-annotation` on special-case links like Twitter links anyway, even if they technically do not have 'an annotation'; the JS will handle `.include-annotation` correctly anyway
    link = linkIcon $ addRecentlyChanged x $ addHasAnnotation x $
             Link ("", ["id-not", "include-annotation"], []) [RawInline (Format "html") (T.pack displayTitle)] (T.pack f,"")
    annotated = wasAnnotated link
    -- only un-annotated links get the file itself transcluded after them:
    fileTransclude
      | annotated = []
      | otherwise = generateFileTransclusionBlock am (f, ("",undefined,undefined,undefined,undefined,undefined,undefined))
    -- if there is nothing like a bl/sl/abstract and it's a PDF, then the transcluded PDF will be invisible on mobile, and we would see a weird section like 'PDF title:\n[blank space]', because the PDF transclude will be hidden as useless on mobile but the colon will still be implying it's there; hence the colon is wrapped in '.mobile-not'.
    -- TODO: the colon for non-PDF files is disabled, as it might be causing link chomping now? (It was: `[Str " \8202:"]`, using a HAIR SPACE to avoid link chomping.)
    linkColon
      | annotated || null fileTransclude = []
      | ".pdf" `isInfixOf` f             = [Span ("",["mobile-not"],[]) [Str " \8202:"]]
      | otherwise                        = []

-- transclude a *file* (or possibly a URL) directly, if possible. For example, an image will be displayed by `generateAnnotationTransclusionBlock` as a normal list item with its name & metadata as text, but then the image itself will be displayed immediately following it. `generateFileTransclusionBlock` handles the logic of transcluding each supported file type, as each file will require a different approach. (Image files are supported directly by Pandoc, but video files require raw HTML to be generated, while CSV files must be rendered to HTML etc.)
--
-- Collapse behavior: media types are displayed by default everywhere (the user wants to see them immediately because it's easy to see an image etc., and performance-wise they are cheap, because they are either small like images or set to their equivalents of 'lazy loading' like video/audio); document types are collapsed by default everywhere (many users will have no interest and documents like PDFs or HTML can be almost arbitrarily large, like a HTML mirror of "The Forgotten Pixel Art Masterpieces of the PlayStation 1 Era" which due to the animations is fully 183MB!).
-- We want to display media (particularly images) by default, so tag-directories can serve as informal 'galleries'; many images will never be seen in pages/annotations, nor do I want to constantly update a 'gallery' page with every single minimally-interesting image, and images are highly suitable for browsing very rapidly through, so it is fine to display all images for scrolling through.
--
-- For a list of legal Gwern.net filetypes, see </lorem-link#file-type>.
-- Supported: documents/code (most, see `isDocumentPreviewable`/`isCodePreviewable`); images (all except PSD); audio (MP3); video (avi, MP4, WebM, YouTube, except SWF); archive/binary (none).
-- Views: for the purposes of popups & prefetches, we distinguish between a document being 'viewable' and 'previewable'. A 'viewable' document is itself, the raw original literal file, 'viewable' in-browser, like a PDF; a 'previewable' document may not be viewable (because that doesn't work at all, or it looks bad, or it is too large etc.), but there is some version of it which can be 'viewable' (eg. a HTML-rendered export of a spreadsheet). A key performance difference here is that a 'viewable' file, like a PDF, can be worth prefetching, because if the reader wants to read it, that saves time; however, for a 'previewable' file, that is *not* the case---prefetching the file doesn't prefetch the preview-version and so the reader still has to wait on that.
generateFileTransclusionBlock :: ArchiveMetadata -> (FilePath, MetadataItem) -> [Block]
-- Calling with an empty URL is treated as a caller bug and aborts with `error`.
generateFileTransclusionBlock _             x@("",                _) = error $ "LM.generateFileTransclusionBlock: called with no URL? " ++ show x
-- generateFileTransclusionBlock _ x@(_, ("","","","",[],[],"")) = error $ "LM.generateFileTransclusionBlock: called with a completely empty annotation? " ++ show x
-- Main case: of the `MetadataItem`, only the title field (`tle`) is pattern-matched; the whole pair `x` is otherwise used only in an error message.
-- The result is either `[]` (nothing to transclude) or a single `Div` of class `aux-links-transclude-file` wrapping the blocks chosen by `generateFileTransclusionBlock'`.
generateFileTransclusionBlock am x@(f, (tle,_,_,_,_,_,_)) = if null generateFileTransclusionBlock' then [] else [Div ("", ["aux-links-transclude-file"], []) generateFileTransclusionBlock']
 where
   -- `f` is the URL as passed in; `f'` is whatever `localizeLinkURL am f` returns. That lookup runs in IO and is pulled into pure code with `unsafePerformIO`.
   -- (Presumably `f'` is the local-archive path when one exists, and `f` unchanged otherwise---confirm against `localizeLinkURL`.)
   f'     = unsafePerformIO $ localizeLinkURL am f
   localP = isLocal $ T.pack f'
   liveP  = alreadyLive $ linkLiveString f' -- for web pages which are link-live capable, we wish to file-transclude them; this is handled by annotations as usual, but for annotation-less URLs we have the same problem as we do for annotation-less local-file media - #Miscellaneous tag-directories get shafted. So we check for link-live here and force a fallback for links which are live but annotation-less.
   -- File size in MB (bytes / 1,000,000, rounded to an `Int`); 0 for anything non-local. The path handed to `getFileSize` is `f'` with its first character dropped and any '#fragment' cut off.
   -- NOTE(review): the `tail` presumably strips a leading '/' so the path is relative to the working directory---confirm; it is only evaluated when `localP` holds.
   fileSizeMB       = if not localP then 0 else
                        round (fromIntegral (unsafePerformIO $ getFileSize $ takeWhile (/='#') $ tail f') / (1000000::Double)) :: Int
   -- The size is only mentioned when it reaches `CM.minFileSizeWarning`; below that, the string is empty.
   fileSizeMBString = if fileSizeMB < CM.minFileSizeWarning then "" else show fileSizeMB++"MB"
   -- Human-readable type: YouTube & Twitter are recognized from the *original* URL `f`; everything else is derived from the extension of the localized `f'`.
   fileTypeDescription = if C.youtubeWatchPrefix `isPrefixOf` f then "YouTube video"
                         else if C.twitterHostPrefix `isPrefixOf` f && C.twitterStatusInfix `isInfixOf` f then "Tweet"
                              else CM.fileExtensionToEnglish $ takeExtension f'
   -- Fallbacks when no type description was found: live-capable remote links are labeled "External Link", and anything else "page".
   fileTypeDescriptionString  | fileTypeDescription/="" = fileTypeDescription
                              | liveP && not localP     = "External Link"
                              | otherwise               = "page"
   -- Type description plus an optional parenthesized size, eg. 'TYPE (12MB)'.
   fileDescription           = Str $ T.pack $
                                     fileTypeDescriptionString
                                  ++ (if null fileSizeMBString then "" else " ("++fileSizeMBString ++ ")")
   -- Link text: the title as raw HTML when there is one, otherwise the localized path rendered as inline code.
   title        = if null tle then Code nullAttr (T.pack f') else RawInline (Format "HTML") $ T.pack tle
   -- Caption paragraph shown above collapsed transcludes: bold 'View <description>' followed by a colon.
   titleCaption = [Strong [Str "View ", fileDescription], Str ":"]
   dataArguments = if C.wikipediaArticleInfix `isInfixOf` f' && isWPAPI (T.pack f') then [("include-template", "$annotationFileIncludeTemplate")] else [] -- use special template to exclude the duplicate title when we render WP articles via the API (which is irrelevant when it's just an iframe live-link); doesn't apply to Twitter transcludes yet, but if necessary, they can get a custom one too
   -- The guards below are tried in order, so the earlier special-cases (essays, Wikipedia/Twitter) take precedence over the generic document/media/live-link handling.
   generateFileTransclusionBlock'
    -- NOTE(review): at least one visible caller passes `undefined` for the non-title metadata fields, so `show x` would itself throw if this branch were ever reached from there; the branch is still fatal either way.
    | f' == "" = error $ "LinkMetadata.generateFileTransclusionBlock.generateFileTransclusionBlock': localized filepath (`f'`) was empty string; this should never happen! `(FilePath,MetadataItem)` input was: " ++ show x
    | isPagePath (T.pack f') = [] -- for essays, we skip the transclude block: transcluding an entire essay is a bad idea!
    | C.wikipediaArticleInfix `isInfixOf` f' || (C.twitterHostPrefix `isPrefixOf` f && C.twitterStatusInfix `isInfixOf` f) =
      [Para [Link ("",["id-not", "include-content"],dataArguments) [title] (T.pack f, "")]] -- NOTE: Twitter/Wikipedia special-case: we link the *original* Twitter URL, to get the JS transform of the local-archive (instead of displaying the local Nitter snapshot in an iframe as a regular web page)
    -- PDFs cannot be viewed on mobile due to poor mobile browser support + a lack of good PDF → HTML converter, so we have to hide that specifically for mobile.
    -- Documents & code: emitted inside a `.collapse` div (caption + lazily-included link), with `.mobile-not` added when the path contains ".pdf".
    | isDocumentPreviewable f' || isCodePreviewable f' = [Div ("", "collapse":(if ".pdf" `isInfixOf` f' then ["mobile-not"] else []), [])
                                                      [Para titleCaption, Para [linkIcon $ Link ("", ["id-not", "link-annotated-not", "include-content", "include-lazy"], []) [title] (T.pack f', "")]]] -- TODO: do we need .link-annotated-not set on either of these links?
    -- image/video/audio:
    | isMediaViewable f' || C.youtubeWatchPrefix `isPrefixOf` f =
      -- multimedia can be annotated; if it is (has a title & author etc.), we don't need to display additional metadata, and we just display it immediately literally:
        [Para [Link ("",["link-annotated-not", "include-content", "width-full"],[]) [title] (T.pack f', "")]]
    -- Everything else: transcluded (collapsed, lazily) only if the link is live-capable; otherwise nothing is emitted.
    | otherwise = if not liveP then [] else
        [Div ("",["collapse"],[])
          [Para titleCaption, Para [linkIcon $ Link ("", ["id-not", "link-annotated-not", "include-content", "include-lazy"], []) [title] (T.pack f', "")]]]

-- Predicates classifying a path by how (or whether) it can be shown in-browser.
-- Document types excluded: ebt, epub, mdb, mht, ttf, docs.google.com; these cannot be viewed easily in-browser (yet?).

-- A document is previewable only if it is local, and then if any of the following holds (checked in this order):
-- it has a `.html` extension; it matches one of `C.documentPreviewableExtensions` (Pandoc syntax-highlighted or native-browser);
-- or it has an HTML substitute (these are converted by LibreOffice to clean HTML versions for preview).
isDocumentPreviewable :: FilePath -> Bool
isDocumentPreviewable path
  | not (isLocal (T.pack path))                     = False
  | hasExtensionS ".html" path                      = True
  | anyInfix path C.documentPreviewableExtensions   = True
  | otherwise                                       = hasHTMLSubstitute path

-- Local source files have syntax-highlighted versions we can load. (NOTE: we cannot transclude remote files which match these,
-- because many URLs are not 'cool URIs' and casually include extensions like '.php' or '.js' while being HTML outputs thereof.)
-- We exclude `/static/*/.html` since that's not possible.
isCodePreviewable :: FilePath -> Bool
isCodePreviewable path
  | isLocal (T.pack path) = anySuffix path C.codePreviewableExtensions
  | otherwise             = False

-- Media means: an image filename, a video filename, or an `.mp3`. Unlike the other predicates, this one does not require the path to be local.
isMediaViewable :: FilePath -> Bool
isMediaViewable path
  | Image.isImageFilename path = True
  | Image.isVideoFilename path = True
  | otherwise                  = hasExtensionS ".mp3" path

-- A file is viewable if it is local and either ends in one of `C.fileViewableExtensions` or counts as media.
isFileViewable :: FilePath -> Bool
isFileViewable path
  | not (isLocal (T.pack path))             = False
  | anySuffix path C.fileViewableExtensions = True
  | otherwise                               = isMediaViewable path

-- Make in-page anchor links absolute. Annotations, like </face>, often link to their own sections or anchors, eg. 'I clean the data with [Discriminator Ranking](#discriminator-ranking)'; once transcluded into other pages, such links are broken.
-- We do not want to rewrite the original abstract to `[Discriminator Ranking](/face#discriminator-ranking)`, because that screws with section-popups/link-icons. So instead, when writing out the body of each annotation inside the link bibliography (while the original URL is still known), every `href="#…"` is rewritten to `href="<original URL>#…"`.
-- WARNING: because of the usual RawBlock/Inline(HTML) issues, reading with Pandoc doesn't help---it just yields RawInline elements which would still need to be parsed somehow. Hence this braindead string-rewrite; in annotations, there shouldn't be *too* many cases where the href=# pattern shows up without being a div link…
rewriteAnchors :: FilePath -> T.Text -> T.Text
rewriteAnchors f body = T.pack $ replace "href=\"#" absolutePrefix $ T.unpack body
  where
    -- same attribute opening, but with the annotation's own URL spliced in before the '#'
    absolutePrefix = "href=\"" ++ f ++ "#"

-- Find inconsistently-affiliated links: all instances where I link "https://arxiv.org/abs/1410.5401" when it should be "https://arxiv.org/abs/1410.5401#deepmind", ie. the same base URL appears in several variants and at least one variant carries an affiliation from the whitelist of orgs.
-- Returns each offending base URL (everything before the first '#') paired with all of its variants.
findDuplicatesURLsByAffiliation :: Metadata -> [(String, [String])]
findDuplicatesURLsByAffiliation md = M.toList $ M.filter severalNonWhitelisted affiliated
  where
    -- only keys containing a '.' are considered
    candidateURLs = nubOrd [ url | (url, _) <- M.toList md, '.' `elem` url ]
    -- group every URL under its fragment-less prefix
    byBaseURL     = M.fromListWith (++) [ (takeWhile (/= '#') url, [url]) | url <- candidateURLs ]
    -- an affiliation can appear either as a '#org' fragment or as an 'org=org' parameter
    orgPatterns   = [ "#" ++ org | org <- CLID.affiliationAnchors ] ++
                    [ "org=" ++ org | org <- CLID.affiliationAnchors ]
    -- keep only the groups in which some variant mentions an affiliation
    affiliated    = M.filter (any (`anyInfix` orgPatterns)) byBaseURL
    -- a group is reported when more than one variant remains after dropping whitelisted ones
    severalNonWhitelisted variants =
      length [ v | v <- variants, not (anyInfix v C.duplicateAffiliationWhitelist) ] > 1

-- How do we handle files with appended data, which are linked like '/doc/reinforcement-learning/model-free/2020-bellemare.pdf#google' but exist as files as '/doc/reinforcement-learning/model-free/2020-bellemare.pdf'? We can't just look up the *filename*, because it's missing the # fragment, and the annotation is usually for the full path including the fragment.
-- So: if the exact lookup fails (or finds an entry whose title, author, date & abstract are all empty), we fall back to looking for any annotation key with the requested path as a strict *prefix*, and accept the first match (in the map's key order).
-- The returned path is the key the metadata was actually found under; when nothing useful is found, the original path is returned with an all-empty item.
lookupFallback :: Metadata -> String -> (FilePath, MetadataItem)
lookupFallback m u = case M.lookup u m of
                       Just ("","","",_,_,_,"") -> viaPrefix
                       Just mi                  -> (u, mi)
                       Nothing                  -> viaPrefix
  where
    -- what we return when no usable annotation exists
    blank      = (u, ("", "", "", "", [], [], ""))
    -- all keys which strictly extend the requested path
    longerKeys = M.keys $ M.filterWithKey (\url _ -> u `isPrefixOf` url && url /= u) m
    viaPrefix  = case longerKeys of
      []      -> blank
      (u':_)
        | ".md" `isInfixOf` u' -> blank
        -- sometimes the fallback is useless, eg. a link to a section will trigger a 'longer' hit, like
        -- '/review/cat.md' will trigger a fallback to /fuzz-testing'; the longer hit will also be empty,
        -- usually, and so not better. We check for that case and return the original path, not the longer one.
        | otherwise -> case lookupFallback m u' of
            (_, item)
              | item == ("", "", "", "", [], [], "") -> blank
              | otherwise                            -> (u', item)


-------------------------------------------------------------------------------------------------------------------------------

-- Order items by their third field (see `third`): a descending stable sort which is then reversed, so the result is ascending, with equal-keyed items in the reverse of their input order.
sortItemDate :: [MetadataItem] -> [MetadataItem]
sortItemDate items = reverse (sortBy descending items)
  where
    descending a b = compare (third b) (third a)

-- Like `sortItemDate`, but for `(path, (item, string))` entries: keyed on the third field of the embedded item, sorted descending and then reversed (so ascending, with ties in the reverse of their input order).
sortItemPathDate :: [(Path,(MetadataItem,String))] -> [(Path,(MetadataItem,String))]
sortItemPathDate entries = reverse (sortBy descending entries)
  where
    keyOf (_, (item, _)) = third item
    descending x y       = compare (keyOf y) (keyOf x)

-- Project the third component out of a 7-tuple.
third :: (a,b,c,d,dc,e,f) -> c
third septuple = case septuple of
  (_, _, x3, _, _, _, _) -> x3

-- Stable sort of `(path, item)` entries in descending order of the item's fourth field (see `fourth`); the path plays no role in the ordering.
sortItemPathDateCreated :: MetadataList -> MetadataList
sortItemPathDateCreated entries = sortBy descending entries
  where
    descending (_, itemA) (_, itemB) = compare (fourth itemB) (fourth itemA)

-- Project the fourth component out of a 7-tuple.
fourth :: (a,b,c,d,e,f,g) -> d
fourth septuple = case septuple of
  (_, _, _, x4, _, _, _) -> x4

-- Sort `(path, item)` entries on the item's fourth field (see `fourth`): a descending stable sort which is then reversed, so the result is ascending, with equal-keyed entries in the reverse of their input order.
sortItemPathDateModified :: MetadataList -> MetadataList
sortItemPathDateModified entries = reverse (sortBy descending entries)
  where
    descending (_, itemA) (_, itemB) = compare (fourth itemB) (fourth itemA)

-- Variant of `sortItemPathDateModified` with a tie-breaker: entries are ordered by the item's fourth field, descending, and entries sharing that value are ordered by path, ascending.
sortItemDateModified :: MetadataList -> MetadataList
sortItemDateModified = sortBy byDateThenPath
  where
    byDateThenPath (pathA, itemA) (pathB, itemB) =
      case compare (fourth itemB) (fourth itemA) of
        EQ       -> compare pathA pathB -- same date: fall back to the path
        decisive -> decisive

-- Sort a list of entries newest-first, using the annotation last-modified date (the fourth `MetadataItem` field) when available (as 'YYYY[-MM[-DD]]', which string-sorts correctly), and falling back to sorting on the paths ('YYYY-author.pdf') when neither entry has a date.
-- Ordering rules, in priority order:
--
-- 1. if at least one of the two entries has a date, the later date comes first (so an undated entry sorts after every dated one);
-- 2. if both are undated and both paths start with '/', reverse alphabetical order on the path (for 'YYYY-author.pdf' filenames in the same directory, that is again newest-first);
-- 3. if both are undated and only one path starts with '/', the '/' path comes first;
-- 4. otherwise, plain alphabetical order on the path.
--
-- An empty path is treated as not starting with '/' (rather than crashing on `head`).
sortByDateModified :: [(FilePath, MetadataItem, FilePath)] -> [(FilePath, MetadataItem, FilePath)]
sortByDateModified = sortBy compareEntries
  where
    -- total replacement for `head p == '/'`
    isRooted :: FilePath -> Bool
    isRooted ('/':_) = True
    isRooted _       = False

    compareEntries (f, (_, _, _, d, _, _, _), _) (f', (_, _, _, d', _, _, _), _)
      | not (null d) || not (null d') = compare d' d  -- arguments flipped: later dates sort first
      | isRooted f && isRooted f'     = compare f' f  -- arguments flipped: reverse alphabetical among '/' paths
      | isRooted f                    = LT            -- a '/' path sorts before a non-'/' path
      | isRooted f'                   = GT            -- a non-'/' path sorts after a '/' path
      | otherwise                     = compare f f'  -- alphabetical order for the rest

-- Sort a list of entries newest-first by the third `MetadataItem` field (a date string which sorts correctly as a string), falling back to the paths when neither entry has a date.
-- Ordering rules, in priority order:
--
-- 1. if at least one of the two entries has a date, the later date comes first (so an undated entry sorts after every dated one);
-- 2. if both are undated and both paths start with '/', reverse alphabetical order on the path;
-- 3. if both are undated and only one path starts with '/', the '/' path comes first;
-- 4. otherwise, plain alphabetical order on the path.
--
-- An empty path is treated as not starting with '/' (rather than crashing on `head`).
sortByDatePublished :: [(FilePath, MetadataItem, FilePath)] -> [(FilePath, MetadataItem, FilePath)]
sortByDatePublished = sortBy compareEntries
  where
    -- total replacement for `head p == '/'`
    isRooted :: FilePath -> Bool
    isRooted ('/':_) = True
    isRooted _       = False

    compareEntries (f, (_, _, d, _, _, _, _), _) (f', (_, _, d', _, _, _, _), _)
      | not (null d) || not (null d') = compare d' d  -- arguments flipped: later dates sort first
      | isRooted f && isRooted f'     = compare f' f  -- arguments flipped: reverse alphabetical among '/' paths
      | isRooted f                    = LT            -- a '/' path sorts before a non-'/' path
      | isRooted f'                   = GT            -- a non-'/' path sorts after a '/' path
      | otherwise                     = compare f f'  -- alphabetical order for the rest
