#!/usr/bin/env runghc
{-# LANGUAGE OverloadedStrings #-}
module Main where

-- Generate "link bibliographies" for Gwern.net pages: automatically-generated bibliographies of references (links).
-- To avoid confusing readers, they are just called 'bibliographies' in the UI.
--
-- Link bibliographies are similar to directory indexes in compiling a list of all links on a
-- Gwern.net page/essay, in order, with their annotations (where available). They are the
-- forward-citation dual of backlinks (all links pointing *out* rather than *in*),
-- are much easier to synoptically browse than mousing over links one at a time,
-- and can help provide a static version of the page (ie. download page + link
-- bibliography to preserve the annotations).
--
-- Link bibliographies are generated by parsing each $PAGE (provided in default.html as '$url$'),
-- filtering for Links using the Pandoc API, querying the metadata, generating a numbered list of
-- links, and then writing out the generated Markdown file to 'metadata/annotation/link-bibliography/$ESCAPED($PAGE).html'.
-- They are compiled like normal pages by Hakyll, and they are exposed to readers as an additional
-- link in the page metadata block, paired with the backlinks.

import Control.Monad (when)
import Data.List (isPrefixOf, isSuffixOf, sort, (\\))
import Data.Containers.ListUtils (nubOrd)
import qualified Data.Map as M (lookup, keys)
import System.FilePath (takeDirectory, takeFileName)

import Data.Text.IO as TIO (readFile)
import qualified Data.Text as T (pack, unpack)
import System.Directory (doesFileExist, getModificationTime)
import Control.Monad.Parallel as Par (mapM_)

import Text.Pandoc (Inline(Code, Link, RawInline, Span, Str, Strong, Space), Format(Format), def, nullAttr, nullMeta, readMarkdown, readerExtensions, writerExtensions, runPure, pandocExtensions, ListNumberDelim(DefaultDelim), ListNumberStyle(DefaultStyle), Block(BlockQuote, Div, OrderedList, Para), Pandoc(..), writeHtml5String)
import Text.Pandoc.Walk (walk)

import LinkArchive (readArchiveMetadata, ArchiveMetadata)
import LinkBacklink (getLinkBibLink, getAnnotationLinkCheck)
import LinkID (metadataItem2ID)
import LinkMetadata (generateAnnotationTransclusionBlock, readLinkMetadata, hasAnnotation, isPagePath)
import LinkMetadataTypes (Metadata, MetadataItem)
import Query (extractLinkIDsWith)
import Typography (typographyTransformTemporary, titlecase')
import Utils (writeUpdatedFile, replace, printRed, toPandoc)
import Interwiki (convertInterwikiLinks)
import qualified Config.Misc as C (mininumLinkBibliographyFragment)
import Inflation (isInflationURL)

main :: IO ()
main = do
  metadata <- readLinkMetadata
  archive  <- readArchiveMetadata
  -- Emit one HTML fragment per page/annotation, containing only the bibliography list
  -- (no header or full-page wrapper), so it transcludes cleanly *into* popups:
  Par.mapM_ (writeLinkBibliographyFragment archive metadata) (sort (M.keys metadata))

-- | Decide whether the link-bibliography fragment for a given metadata key needs
-- (re)generation, and if so, delegate to 'parseExtractCompileWrite'.
--
-- Skips: keys with no metadata entry, and entries with an empty abstract.
-- Freshness check: if the fragment file exists, compare its mtime against the
-- original source (the Markdown essay for page paths, or the annotation file
-- otherwise) and rewrite only when the original is at least as new. A missing
-- fragment or missing original is treated as "must write" (arbitrarily stale).
writeLinkBibliographyFragment :: ArchiveMetadata -> Metadata -> FilePath -> IO ()
writeLinkBibliographyFragment am md path =
  case M.lookup path md of
    Nothing -> return ()
    Just (_,_,_,_,_,_,"") -> return ()
    Just (_,_,_,_,_,_,abstract) -> do
      let self = takeWhile (/='#') path          -- strip any anchor to get the page itself
      let selfAbsolute = "https://gwern.net" ++ self
      let (path',_) = getLinkBibLink path
      lbExists <- doesFileExist path'
      -- Essay pages are root-relative paths with no file extension.
      -- NOTE: use `isPrefixOf` rather than `head path == '/'`: `head` is partial and
      -- would crash on an (erroneous but possible) empty metadata key.
      let essayp = "/" `isPrefixOf` path && '.' `notElem` path
      shouldWrite <- if not lbExists then return True else -- if it doesn't exist, it could be arbitrarily out of date so we default to trying to write it:
                                     do target <- if essayp then return $ tail (takeWhile (/='#') path) ++ ".md"
                                                  else fmap fst (getAnnotationLinkCheck path)
                                        originalExists <- doesFileExist target
                                        if not originalExists then return True else do
                                          originalLastModified <- getModificationTime target
                                          lbLastModified       <- getModificationTime path'
                                          -- '>=' (not '>'): equal timestamps still trigger a rewrite, erring on the side of freshness.
                                          return (originalLastModified >= lbLastModified)

      when shouldWrite $ parseExtractCompileWrite am md path path' self selfAbsolute abstract

-- | Extract links from the page/annotation, compile a numbered bibliography into
-- HTML, and write it out as a fragment file.
--
-- Arguments: archive metadata, link metadata, original key @path@, fragment output
-- @path'@, the page sans-anchor @self@, its absolute URL @selfAbsolute@, and the
-- annotation @abstract@ (non-empty for annotation keys; may be "" for sections).
-- Writes nothing if, after filtering, too few links remain to be worth a fragment.
parseExtractCompileWrite :: ArchiveMetadata -> Metadata -> String -> FilePath -> String -> String -> String -> IO ()
parseExtractCompileWrite am md path path' self selfAbsolute abstract = do
        -- toggle between parsing the full original Markdown page, and just the annotation abstract:
        -- NOTE: `isPrefixOf` instead of the partial `head path == '/'`, which crashes on "".
        linksRaw <- if "/" `isPrefixOf` path && '.'`notElem`path then
                      if '#' `elem` path && abstract=="" then return [] -- if it's just an empty annotation triggered by a section existing, ignore
                      else
                        extractLinksFromPage (tail (takeWhile (/='#') path) ++ ".md") -- Markdown essay
                    else return $ nubOrd $ map (\(a,b) -> (T.unpack a, T.unpack b)) $ extractLinkIDsWith (const True) (T.pack path) $ toPandoc abstract -- annotation
            -- delete self-links in essays, such as in the ToC of scraped abstracts, or newsletters linking themselves as the first link (eg. '/newsletter/2022/05' will link to 'https://gwern.net/newsletter/2022/05' at the beginning); but allow self-links for files or external URLs (eg. for when we do the hash trick to annotate different parts of the same URL/file and cross-reference them)
        let links = filter (\(l,_) -> not ((isPagePath (T.pack l) && (self `isPrefixOf` l || selfAbsolute `isPrefixOf` l)) || isInflationURL (T.pack l))) linksRaw
        -- WP links don't count toward the minimum: they'd be hidden in the collapsed sub-list anyway.
        when (length (filter (\(l,_) -> not ("https://en.wikipedia.org/wiki/" `isPrefixOf` l))  links) >= C.mininumLinkBibliographyFragment) $
          do

             let triplets  = linksToAnnotations md path links
                 tripletsN = length triplets
                 -- show a count like "Bibliography (12):" only when there are ≥2 entries:
                 countString = if tripletsN < 2 then [] else [Str " (", Str (T.pack $ show tripletsN), Str ")"]
                 body = [Para [Link ("",["icon-special"], []) [Strong $ [Str "Bibliography"]++countString++[Str ":"]] ("/design#link-bibliographies", "")], generateLinkBibliographyItems am path triplets]
                 document = Pandoc nullMeta body
                 html = runPure $ writeHtml5String def{writerExtensions = pandocExtensions} $
                   walk typographyTransformTemporary $ convertInterwikiLinks $ walk (hasAnnotation md) document
             case html of
               Left e   -> printRed (show e)
               -- compare with the old version, and update if there are any differences:
               Right p' -> do when (path' == "") $ error ("generateLinkBibliography.hs: writeLinkBibliographyFragment: writing out failed because received empty path' from getLinkBibLink for original path: " ++ path)
                              writeUpdatedFile "link-bibliography-fragment" path' p'

-- | Render the annotated link triplets as a single numbered list, with Wikipedia
-- links segregated into a collapsed sub-list at the end (they are numerous and bulky).
generateLinkBibliographyItems :: ArchiveMetadata -> String -> [(String,String,MetadataItem)] -> Block
generateLinkBibliographyItems _ _ [] = Para []
generateLinkBibliographyItems am pathParent items =
  OrderedList (1, DefaultStyle, DefaultDelim) (primaryEntries ++ wikipediaSublist)
  where
    isWikipedia (u,_,_) = "https://en.wikipedia.org/wiki/" `isPrefixOf` u
    wpItems       = filter isWikipedia items
    primaryItems  = items \\ wpItems
    primaryEntries = map (generateLinkBibliographyItem am pathParent False) primaryItems
    -- because WP links are so numerous, and so bulky, stick them into a collapsed sub-list at the end:
    -- TODO: make these .include-content links?
    wikipediaSublist
      | null wpItems = []
      | otherwise    = [[Div ("",["collapse"],[])
                          [ Para [Strong [Str "Wikipedia Bibliography:"]]
                          , OrderedList (1, DefaultStyle, DefaultDelim)
                              (map (generateLinkBibliographyItem am pathParent True) wpItems)]]]
-- | Render one bibliography entry. Three cases, dispatched on the Bool and on
-- whether the MetadataItem carries an abstract:
--   1. "" abstract + False (non-Wikipedia): a short one-line entry, possibly with a context-transclusion blockquote;
--   2. any other item: a full annotation-transclusion block.
-- The second String argument is the parent page path, used to build the
-- back-to-original-context anchor link.
generateLinkBibliographyItem  :: ArchiveMetadata -> String -> Bool -> (String,String,MetadataItem) -> [Block]
generateLinkBibliographyItem _ pathParent False (f,ident,(t,_,_,_,_,_,""))  = -- short:
 let f' -- display text for title-less links: full URL for external, shortened path for local files
       | "http" `isPrefixOf` f = f
       | "index" `isSuffixOf` f = takeDirectory f
       | otherwise = takeFileName f
       -- NOTE: this must include the parent URL, like '/design#gwern-sidenote' instead of just '#gwern-sidenote', otherwise it will be rewritten incorrectly when transcluded. Like in the link-bib for '/design', a href='#gwern-sidenote' would be rewritten by the transclude JS to point to the 'original within-page anchor', in this case, '/metadata/annotation/link-bibliography/%2fdesign.html', which is of course completely incorrect - we want it to point to the ID anchor in */design*.
     prefix = if null ident || not (isPagePath (T.pack f)) then [] else [Link ("",["id-not", "link-bibliography-context"],[]) [Str "\8203"] ((T.pack $ (takeWhile (/='#') pathParent) ++ "#" ++ ident), "Original context in page."), Space] -- ZERO WIDTH SPACE to make clear that 'this link intentionally left blank';
     -- Imagine we link to a target on another Gwern.net page like </question#feynman>. It has no full annotation and never will, not even a title.
     -- So it would show up in the link-bib as merely eg. '55. `/question#feynman`'. Not very useful! Why can't it simply transclude that snippet instead?
     -- So, we do that here: if it is a local page path, has an anchor `#` in it, and does not have an annotation ("" pattern-match guarantees that),'
     -- we try to append a blockquote with the `.include-block-context` class, to make it look like the backlinks approach to transcluding the context
     -- at a glance:
     transcludeTarget = if not (isPagePath (T.pack f) && '#' `elem` f) then [] else
                          [BlockQuote [Para [Link ("", ["backlink-not", "include-block-context", "link-annotated-not", "collapsible"], []) -- TODO: do we need .link-annotated-not if we are explicitly transcluding .include-block-context?
                                              [Span ("",["abstract-collapse"],[]) [Str "[Transclude the forward-link's context]"]] (T.pack f,"")]]]
     -- I skip date because files don't usually have anything better than year, and that's already encoded in the filename which is shown
 in
    -- no title: render the URL/path itself as inline code; with a title: render the (titlecased) title as raw HTML:
    if t=="" then
      Para (prefix ++ [Link ("",["id-not"],[]) [Code nullAttr (T.pack f')] (T.pack f, "")]) : transcludeTarget
    else
      Para (prefix ++ [Link ("",["id-not"],[]) [RawInline (Format "HTML") (T.pack $ titlecase' t)] (T.pack f, "")]) : transcludeTarget
-- long items:
generateLinkBibliographyItem am pathParent _ (f,ident,mi) =
  let prefix = if null ident || not (isPagePath (T.pack f)) then []
               else [Link ("",["id-not", "link-bibliography-context"],[]) [Str "\8203"] ((T.pack $ (takeWhile (/='#') pathParent) ++ "#" ++ ident), "Original context in page."), Space]
  in
    wrapWith prefix $ generateAnnotationTransclusionBlock am (f,mi)
   where wrapWith p ((Para x) : xs) = Para (p++x) : xs -- inject the prefix into the first Paragraph which is the regular link generated by the transclusion block
         wrapWith p x = error $ "generateLinkBibliography.generateLinkBibliographyItem.wrapWith: attempted rewrite of generateAnnotationTransclusionBlock failed because the pattern-match didn't fire like it's always supposed to; did the output '[Block]' change? Inputs were: prefix: " ++
                        show p ++ " : transclude block: " ++ show x

-- TODO: refactor out to Query?
-- | Read a Markdown file from disk, parse it with Pandoc, and return its links
-- as (URL, anchor-ID) pairs, deduplicated while preserving document order,
-- with absolute Gwern.net URLs normalized to root-relative and self-anchor
-- links ('#…') dropped.
-- A missing file only warns (new '/blog/' posts may not exist on-disk yet);
-- an unparseable file is a fatal error.
extractLinksFromPage :: String -> IO [(String,String)]
extractLinksFromPage "" = error "generateLinkBibliography.extractLinksFromPage: called with an empty '' string argument—this should never happen!"
extractLinksFromPage path =
  do existsp <- doesFileExist path
     if not existsp then printRed ("generateLinkBibliography.extractLinksFromPage: file argument does not exist? tried to read: " ++ path) >> return [] -- new '/blog/' posts may not exist yet on-disk, so we warn but don't fatally error out.
       else
        do f <- TIO.readFile path
           let pE = runPure $ readMarkdown def{readerExtensions=pandocExtensions} f
           return $ case pE of
                      Left  err -> error $ "generateLinkBibliography.extractLinksFromPage: file failed Pandoc parsing; file path was: " ++ path ++ "; error message was: " ++ show err ++ "; full read file contents were: " ++ show f
                      -- make the list unique, but keep the original ordering
                      -- NOTE: `not ("#" `isPrefixOf` l)` rather than `head l /= '#'`:
                      -- `head` is partial and would crash on an empty link target.
                      Right p   -> map (\(a,b) -> (replace "https://gwern.net/" "/" a, b)) $
                                         filter (\(l,_) -> not ("#" `isPrefixOf` l)) $ -- self-links are not useful in link bibliographies
                                         nubOrd $ map (\(a,b) -> (T.unpack a, T.unpack b)) $ extractLinkIDsWith (const True) (T.pack path) p

-- | Look up every (URL, hash) pair against the metadata database, producing
-- (URL, resolved hash ID, annotation) triplets. (An empty list maps to an empty list.)
linksToAnnotations :: Metadata -> String -> [(String,String)] -- (URL, hash)
                   -> [(String,String,MetadataItem)] -- (URL, hash ID of its actual use within the parent, annotation)
linksToAnnotations md parent = map (linkToAnnotation md parent)
-- | Resolve a single (URL, hash) pair to a (URL, ID, annotation) triplet.
-- A URL absent from the metadata gets a blank annotation; an empty hash is
-- replaced by the canonical ID computed from the (possibly blank) metadata item.
-- An empty URL is a fatal error.
linkToAnnotation :: Metadata -> String -> (String,String) -> (String,String,MetadataItem)
linkToAnnotation _ pt ("",ident) = error $ "generateLinkBibliography.linkToAnnotation: empty URL input; identifier: " ++ ident ++ "; parent: " ++ pt
linkToAnnotation m _ (u,ident) =
  case M.lookup u m of
    Just item -> (u, resolveID item, item)
    Nothing   -> let blank = ("","","","",[],[],"") in (u, resolveID blank, blank)
  where -- keep an explicitly-supplied hash; otherwise derive the canonical ID:
        resolveID item = if ident /= "" then ident else T.unpack (LinkID.metadataItem2ID m u item)

