From 2430b54672cb934e97c3e2c527aab900f4f32b09 Mon Sep 17 00:00:00 2001 From: Sergey Gulin Date: Fri, 23 Dec 2022 21:08:51 +0300 Subject: [PATCH 1/3] [#93] Add support for basic html tags Problem: HTML tags can be used in a markdown file. We should add support for basic HTML tags embedded in markdown file. Solution: Add support for image tags (), anchor hyperlinks (Text) and anchor target locations ( or ). --- CHANGES.md | 3 + src/Xrefcheck/Scanners/Markdown.hs | 133 +++++++++++++++++------- tests/golden/check-html/check-html.md | 24 +++++ tests/golden/check-html/check.html.bats | 4 +- tests/golden/check-html/expected.gold | 67 ++++++++++++ 5 files changed, 194 insertions(+), 37 deletions(-) create mode 100644 tests/golden/check-html/expected.gold diff --git a/CHANGES.md b/CHANGES.md index a4d037eb..9f408dac 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -41,6 +41,9 @@ Unreleased * [#231](https://github.com/serokell/xrefcheck/pull/231) + Anchor analysis takes now into account the appropriate case-sensitivity depending on the configured Markdown flavour. +* [#259](https://github.com/serokell/xrefcheck/pull/259) + + Add support for image tags ``, anchor hyperlinks `Text` + and anchor target locations `` or ``. 0.2.2 ========== diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs index bcd6d40f..87ac6bd1 100644 --- a/src/Xrefcheck/Scanners/Markdown.hs +++ b/src/Xrefcheck/Scanners/Markdown.hs @@ -22,6 +22,7 @@ import Universum import CMarkGFM (Node (..), NodeType (..), PosInfo (..), commonmarkToNode, extAutolink, optFootnotes) import Control.Lens (_Just, makeLenses, makeLensesFor, (.=)) +import Control.Monad.Trans.RWS.CPS qualified as RWS import Control.Monad.Trans.Writer.CPS (Writer, runWriter, tell) import Data.Aeson (FromJSON (..), genericParseJSON) import Data.ByteString.Lazy qualified as BSL @@ -195,6 +196,12 @@ removeIgnored = withIgnoreMode . cataNodeWithParentNodeInfo remove (IMSLink _, IMAGE {}) -> do ssIgnore .= Nothing return defNode + (IMSLink _, HTML_INLINE text) | isLink text -> do + ssIgnore .= Nothing + pure defNode + (IMSLink _, HTML_BLOCK text) | isLink text -> do + ssIgnore .= Nothing + pure defNode (IMSLink ignoreLinkState, _) -> do when (ignoreLinkState == ExpectingLinkInSubnodes) $ ssIgnore . _Just . ignoreMode .= IMSLink ParentExpectsLink @@ -264,6 +271,18 @@ removeIgnored = withIgnoreMode . cataNodeWithParentNodeInfo remove pure node (node, _) -> pure node +findAttributes :: [Text] -> [Attribute Text] -> Maybe Text +findAttributes (map T.toLower -> attrs) = + fmap snd . find (\(attr, _) -> T.toLower attr `elem` attrs) + +isLink :: Text -> Bool +isLink (parseTags -> tags) = case safeHead tags of + Just (TagOpen tag attrs) -> + T.toLower tag == "a" && isJust (findAttributes ["href"] attrs) + || T.toLower tag == "img" && isJust (findAttributes ["src"] attrs) + _ -> False + + -- | Custom `foldMap` for source tree. foldNode :: (Monoid a, Monad m) => (Node -> m a) -> Node -> m a foldNode action node@(Node _ _ subs) = do @@ -271,50 +290,41 @@ foldNode action node@(Node _ _ subs) = do b <- concatForM subs (foldNode action) return (a <> b) -type ExtractorM a = ReaderT MarkdownConfig (Writer [ScanError 'Parse]) a +type ExtractorM a = RWS.RWS MarkdownConfig [ScanError 'Parse] (Maybe Reference) a -- | Extract information from source tree. nodeExtractInfo :: Node -> ExtractorM FileInfo nodeExtractInfo input@(Node _ _ nSubs) = do if checkIgnoreAllFile nSubs then return (diffToFileInfo mempty) - else diffToFileInfo <$> (foldNode extractor =<< lift (removeIgnored input)) + else diffToFileInfo <$> (foldNode extractor =<< (RWS.writer . runWriter $ removeIgnored input)) where extractor :: Node -> ExtractorM FileInfoDiff - extractor node@(Node pos ty _) = - case ty of - HTML_BLOCK _ -> do - return mempty + extractor node@(Node pos ty _) = do + reference' <- RWS.get + -- If current state is not `Nothing`, try extracting associated text + let fileInfoDiff = case (reference', ty) of + (Just ref, TEXT text) -> + mempty & fidReferences .~ DList.singleton ref {rName = text} + (Just ref, _) -> mempty & fidReferences .~ DList.singleton ref + _ -> mempty + RWS.put Nothing + fmap (fileInfoDiff <>) case ty of + HTML_BLOCK text | isLink text -> extractHtmlLink text + + HTML_BLOCK text -> extractAnchor text HEADING lvl -> do - flavor <- asks mcFlavor + flavor <- RWS.asks mcFlavor let aType = HeaderAnchor lvl let aName = headerToAnchor flavor $ nodeExtractText node let aPos = toPosition pos return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos} - HTML_INLINE text -> do - let - mName = do - tag <- safeHead $ parseTags text - attributes <- case tag of - TagOpen a attrs - | T.toLower a == "a" -> Just attrs - _ -> Nothing - (_, name) <- find (\(field, _) -> T.toLower field `elem` ["name", "id"]) attributes - pure name + HTML_INLINE text | isLink text -> extractHtmlLink text - case mName of - Just aName -> do - let aType = HandAnchor - aPos = toPosition pos - return $ FileInfoDiff - mempty - (pure $ Anchor {aType, aName, aPos}) - - Nothing -> do - return mempty + HTML_INLINE text -> extractAnchor text LINK url _ -> extractLink url @@ -328,10 +338,7 @@ nodeExtractInfo input@(Node _ _ nSubs) = do rPos = toPosition pos link = if null url then rName else url - let (rLink, rAnchor) = case T.splitOn "#" link of - [t] -> (t, Nothing) - t : ts -> (t, Just $ T.intercalate "#" ts) - [] -> error "impossible" + let (rLink, rAnchor) = splitLink link let rInfo = referenceInfo rLink @@ -339,6 +346,63 @@ nodeExtractInfo input@(Node _ _ nSubs) = do (DList.singleton $ Reference {rName, rPos, rLink, rAnchor, rInfo}) DList.empty + extractAnchor :: Text -> ExtractorM FileInfoDiff + extractAnchor text = do + let mName = do + tag <- safeHead $ parseTags text + attributes <- case tag of + TagOpen a attrs | T.toLower a == "a" -> Just attrs + _ -> Nothing + (_, name) <- find (\(field, _) -> T.toLower field `elem` ["name", "id"]) attributes + pure name + + case mName of + Just aName -> do + let aType = HandAnchor + aPos = toPosition pos + return $ FileInfoDiff + mempty + (pure $ Anchor {aType, aName, aPos}) + + Nothing -> do + return mempty + + extractHtmlReference :: [Attribute Text] -> Maybe PosInfo -> DList.DList Reference + extractHtmlReference attrs tagPos = fromMaybe mempty do + link <- findAttributes ["href"] attrs + let (rLink, rAnchor) = splitLink link + pure . DList.singleton $ Reference "" rLink rAnchor (toPosition tagPos) (referenceInfo rLink) + + splitLink :: Text -> (Text, Maybe Text) + splitLink link = case T.splitOn "#" link of + [t] -> (t, Nothing) + t : ts -> (t, Just $ T.intercalate "#" ts) + [] -> error "impossible" + + extractHtmlImage :: [Attribute Text] -> Maybe PosInfo -> DList.DList Reference + extractHtmlImage attrs tagPos = fromMaybe mempty do + link <- findAttributes ["src"] attrs + pure . DList.singleton $ Reference "" link Nothing (toPosition tagPos) (referenceInfo link) + + extractHtmlLink :: Text -> ExtractorM FileInfoDiff + extractHtmlLink text = + case safeHead $ parseTags text of + Just (TagOpen tag attrs) | T.toLower tag == "img" -> + pure $ mempty & fidReferences .~ extractHtmlImage attrs pos + Just (TagOpen tag attrs) | T.toLower tag == "a" -> do + let reference = extractHtmlReference attrs pos + case DList.toList reference of + [ref] -> do + -- The `cmark-gfm` package parses the link tag as three separate nodes: + -- `HTML_INLINE` with an opening tag, a `TEXT` with text in between, + -- and `HTML_INLINE` with a closing tag. So we keep the extracted link in a state and + -- try to get associated text in the next node. + RWS.put $ Just ref + pure mempty + _ -> pure mempty + _ -> pure mempty + + -- | Check if there is `ignore all` at the beginning of the file, -- ignoring preceding comments if there are any. checkIgnoreAllFile :: [Node] -> Bool @@ -406,11 +470,10 @@ textToMode _ = NotAnAnnotation parseFileInfo :: MarkdownConfig -> LT.Text -> (FileInfo, [ScanError 'Parse]) parseFileInfo config input - = runWriter - $ flip runReaderT config - $ nodeExtractInfo + = RWS.evalRWS + (nodeExtractInfo $ commonmarkToNode [optFootnotes] [extAutolink] - $ toStrict input + $ toStrict input) config Nothing markdownScanner :: MarkdownConfig -> ScanAction markdownScanner config canonicalFile = diff --git a/tests/golden/check-html/check-html.md b/tests/golden/check-html/check-html.md index 957aa5fc..650fd5c9 100644 --- a/tests/golden/check-html/check-html.md +++ b/tests/golden/check-html/check-html.md @@ -6,8 +6,32 @@ ## Title1 + + +text text + [One](#one) [Two](#two) [Three](#three) [Four](#four) [Five](#five) +[Six](#six) +[Seven](#seven) + +Output sample + +text Output sample text + +serokell + +text serokell text + +Six + +text Seven text + + +serokell404 + + +text serokell404 text diff --git a/tests/golden/check-html/check.html.bats b/tests/golden/check-html/check.html.bats index 06ea97ca..5a0e91a0 100644 --- a/tests/golden/check-html/check.html.bats +++ b/tests/golden/check-html/check.html.bats @@ -11,7 +11,7 @@ load '../helpers' @test "All HTML anchors should be valid" { - run xrefcheck + to_temp xrefcheck -v - assert_output --partial "All repository links are valid." + assert_diff expected.gold } diff --git a/tests/golden/check-html/expected.gold b/tests/golden/check-html/expected.gold new file mode 100644 index 00000000..0d394cf6 --- /dev/null +++ b/tests/golden/check-html/expected.gold @@ -0,0 +1,67 @@ +=== Repository data === + + check-html.md: + - references: + - reference (file-local) at src:13:1-11: + - text: "One" + - link: - + - anchor: one + - reference (file-local) at src:14:1-11: + - text: "Two" + - link: - + - anchor: two + - reference (file-local) at src:15:1-15: + - text: "Three" + - link: - + - anchor: three + - reference (file-local) at src:16:1-13: + - text: "Four" + - link: - + - anchor: four + - reference (file-local) at src:17:1-13: + - text: "Five" + - link: - + - anchor: five + - reference (file-local) at src:18:1-11: + - text: "Six" + - link: - + - anchor: six + - reference (file-local) at src:19:1-15: + - text: "Seven" + - link: - + - anchor: seven + - reference (external) at src:21:1-144: + - text: "" + - link: https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png + - anchor: - + - reference (external) at src:23:6-149: + - text: "" + - link: https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png + - anchor: - + - reference (external) at src:25:1-29: + - text: "serokell" + - link: https://serokell.io/ + - anchor: - + - reference (external) at src:27:6-34: + - text: "serokell" + - link: https://serokell.io/ + - anchor: - + - reference (file-local) at src:29:1-13: + - text: "Six" + - link: - + - anchor: six + - reference (file-local) at src:31:6-20: + - text: "Seven" + - link: - + - anchor: seven + - anchors: + - title1 (header II) at src:7:1-96 + - one (hand made) at src:7:4-17 + - two (hand made) at src:7:19-30 + - three (hand made) at src:7:32-47 + - four (hand made) at src:7:49-63 + - five (hand made) at src:7:69-88 + - six (hand made) at src:9:1-12 + - seven (hand made) at src:11:6-17 + +All repository links are valid. From d863dcf33462acbfdc9ca0d16e43fe364135a50a Mon Sep 17 00:00:00 2001 From: Sergey Gulin Date: Fri, 23 Dec 2022 21:54:24 +0300 Subject: [PATCH 2/3] fixup! [#93] Add support for basic html tags --- .xrefcheck.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.xrefcheck.yaml b/.xrefcheck.yaml index 748c1dbb..dc63105f 100644 --- a/.xrefcheck.yaml +++ b/.xrefcheck.yaml @@ -6,7 +6,7 @@ exclusions: ignore: - tests/markdowns/**/* - tests/golden/**/* - - docs/output-sample/**/* + - docs/output-sample/**/*.md scanners: markdown: From 15caf4bae4adbeaf907cbcabdf7264c995a28a34 Mon Sep 17 00:00:00 2001 From: Sergey Gulin Date: Thu, 29 Dec 2022 00:53:00 +0300 Subject: [PATCH 3/3] fixup! fixup! [#93] Add support for basic html tags --- src/Xrefcheck/Scanners/Markdown.hs | 7 ++----- tests/golden/check-html/check-html.md | 6 ++++++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs index 87ac6bd1..55e624b1 100644 --- a/src/Xrefcheck/Scanners/Markdown.hs +++ b/src/Xrefcheck/Scanners/Markdown.hs @@ -273,7 +273,7 @@ removeIgnored = withIgnoreMode . cataNodeWithParentNodeInfo remove findAttributes :: [Text] -> [Attribute Text] -> Maybe Text findAttributes (map T.toLower -> attrs) = - fmap snd . find (\(attr, _) -> T.toLower attr `elem` attrs) + fmap snd . find ((`elem` attrs) . T.toLower . fst) isLink :: Text -> Bool isLink (parseTags -> tags) = case safeHead tags of @@ -282,7 +282,6 @@ isLink (parseTags -> tags) = case safeHead tags of || T.toLower tag == "img" && isJust (findAttributes ["src"] attrs) _ -> False - -- | Custom `foldMap` for source tree. foldNode :: (Monoid a, Monad m) => (Node -> m a) -> Node -> m a foldNode action node@(Node _ _ subs) = do @@ -353,8 +352,7 @@ nodeExtractInfo input@(Node _ _ nSubs) = do attributes <- case tag of TagOpen a attrs | T.toLower a == "a" -> Just attrs _ -> Nothing - (_, name) <- find (\(field, _) -> T.toLower field `elem` ["name", "id"]) attributes - pure name + findAttributes ["name", "id"] attributes case mName of Just aName -> do @@ -402,7 +400,6 @@ nodeExtractInfo input@(Node _ _ nSubs) = do _ -> pure mempty _ -> pure mempty - -- | Check if there is `ignore all` at the beginning of the file, -- ignoring preceding comments if there are any. checkIgnoreAllFile :: [Node] -> Bool diff --git a/tests/golden/check-html/check-html.md b/tests/golden/check-html/check-html.md index 650fd5c9..8ab27fc9 100644 --- a/tests/golden/check-html/check-html.md +++ b/tests/golden/check-html/check-html.md @@ -35,3 +35,9 @@ text Seven text text serokell404 text + + + + + +text text