Docx reader: unwrap content of shaped textboxes...

* #9214 text in shape format test document * #9214 support Text in Shape Format * #9214 remove irrelevant code
author: Stephan Meijer <me@stephanmeijer.com> 2023-11-30 03:59:09 +0100
committer: GitHub <noreply@github.com> 2023-11-29 18:59:09 -0800
commit: 2e8ecb3fba62723fe39599f9f0cb7f7a30a5ca19 (patch)
tree: 96e23e78ccc9bc8a496328740fbfd422e02200ef /src/Text
parent: eff1790bbcce5d56e7a24a9cc2bd7c08e8961c84 (diff)
2 files changed, 55 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index e83d764fd..4dc9543cf 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -148,13 +148,46 @@ mapD f xs =
   in
    concatMapM handler xs
 
+isAltContentRun :: NameSpaces -> Element -> Bool
+isAltContentRun ns element
+  | isElem ns "w" "r" element
+  , Just _altContentElem <- findChildByName ns "mc" "AlternateContent" element
+  = True
+  | otherwise
+  = False
+
+-- Elements such as <w:shape> are not always preferred
+-- to be unwrapped. Only if they are part of an AlternateContent
+-- element, they should be unwrapped.
+-- This strategy prevents VML images breaking.
+unwrapAlternateContentElement :: NameSpaces -> Element -> [Element]
+unwrapAlternateContentElement ns element
+  | isElem ns "mc" "AlternateContent" element
+  || isElem ns "mc" "Fallback" element
+  || isElem ns "w" "pict" element
+  || isElem ns "v" "group" element
+  || isElem ns "v" "rect" element
+  || isElem ns "v" "roundrect" element
+  || isElem ns "v" "shape" element
+  || isElem ns "v" "textbox" element
+  || isElem ns "w" "txbxContent" element
+  = concatMap (unwrapAlternateContentElement ns) (elChildren element)
+  | otherwise
+  = unwrapElement ns element
+
 unwrapElement :: NameSpaces -> Element -> [Element]
 unwrapElement ns element
   | isElem ns "w" "sdt" element
   , Just sdtContent <- findChildByName ns "w" "sdtContent" element
   = concatMap (unwrapElement ns) (elChildren sdtContent)
+  | isElem ns "w" "r" element
+  , Just alternateContentElem <- findChildByName ns "mc" "AlternateContent" element
+  = unwrapAlternateContentElement ns alternateContentElem
   | isElem ns "w" "smartTag" element
   = concatMap (unwrapElement ns) (elChildren element)
+  | isElem ns "w" "p" element
+  , Just (modified, altContentRuns) <- extractChildren element (isAltContentRun ns)
+  = (unwrapElement ns modified) ++ concatMap (unwrapElement ns) altContentRuns
   | otherwise
   = [element{ elContent = concatMap (unwrapContent ns) (elContent element) }]
 
diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs
index 0a869bba8..f373c5330 100644
--- a/src/Text/Pandoc/Readers/Docx/Util.hs
+++ b/src/Text/Pandoc/Readers/Docx/Util.hs
@@ -21,12 +21,14 @@ module Text.Pandoc.Readers.Docx.Util (
                                       , findChildrenByName
                                       , findElementByName
                                       , findAttrByName
+                                      , extractChildren
                                       ) where
 
 import qualified Data.Text as T
 import Data.Text (Text)
 import Text.Pandoc.XML.Light
 import qualified Data.Map as M
+import Data.List (partition)
 
 type NameSpaces = M.Map Text Text
 
@@ -67,3 +69,23 @@ findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
 findAttrByName ns pref name el =
   let ns' = ns <> elemToNameSpaces el
   in  findAttr (elemName ns' pref name) el
+
+
+-- | Removes child elements that satisfy a given condition.
+-- Returns the modified element and the list of removed children.
+extractChildren :: Element -> (Element -> Bool) -> Maybe (Element, [Element])
+extractChildren el condition
+  | null removedChildren = Nothing  -- No children removed, return Nothing
+  | otherwise = Just (modifiedElement, removedChildren)  -- Children removed, return Just
+  where
+    -- Separate the children based on the condition
+    (removedChildren, keptChildren) = partition condition (onlyElems' $ elContent el)
+
+    -- Helper function to filter only Element types from Content
+    onlyElems' :: [Content] -> [Element]
+    onlyElems' = foldr (\c acc -> case c of
+                                   Elem e -> e : acc
+                                   _      -> acc) []
+
+    -- Reconstruct the element with the kept children
+    modifiedElement = el { elContent = map Elem keptChildren }
+\ No newline at end of file
author	Stephan Meijer <me@stephanmeijer.com>	2023-11-30 03:59:09 +0100
committer	GitHub <noreply@github.com>	2023-11-29 18:59:09 -0800
commit	2e8ecb3fba62723fe39599f9f0cb7f7a30a5ca19 (patch)
tree	96e23e78ccc9bc8a496328740fbfd422e02200ef /src/Text
parent	eff1790bbcce5d56e7a24a9cc2bd7c08e8961c84 (diff)