summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs33
-rw-r--r--src/Text/Pandoc/Readers/Docx/Util.hs22
-rw-r--r--test/Tests/Readers/Docx.hs4
-rw-r--r--test/docx/text_in_shape_format.docxbin0 -> 42504 bytes
-rw-r--r--test/docx/text_in_shape_format.native33
5 files changed, 92 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index e83d764fd..4dc9543cf 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -148,13 +148,46 @@ mapD f xs =
in
concatMapM handler xs
+isAltContentRun :: NameSpaces -> Element -> Bool
+isAltContentRun ns element
+ | isElem ns "w" "r" element
+ , Just _altContentElem <- findChildByName ns "mc" "AlternateContent" element
+ = True
+ | otherwise
+ = False
+
+-- Elements such as <w:shape> are not always preferred
+-- to be unwrapped. Only if they are part of an AlternateContent
+-- element, they should be unwrapped.
+-- This strategy prevents VML images breaking.
+unwrapAlternateContentElement :: NameSpaces -> Element -> [Element]
+unwrapAlternateContentElement ns element
+ | isElem ns "mc" "AlternateContent" element
+ || isElem ns "mc" "Fallback" element
+ || isElem ns "w" "pict" element
+ || isElem ns "v" "group" element
+ || isElem ns "v" "rect" element
+ || isElem ns "v" "roundrect" element
+ || isElem ns "v" "shape" element
+ || isElem ns "v" "textbox" element
+ || isElem ns "w" "txbxContent" element
+ = concatMap (unwrapAlternateContentElement ns) (elChildren element)
+ | otherwise
+ = unwrapElement ns element
+
unwrapElement :: NameSpaces -> Element -> [Element]
unwrapElement ns element
| isElem ns "w" "sdt" element
, Just sdtContent <- findChildByName ns "w" "sdtContent" element
= concatMap (unwrapElement ns) (elChildren sdtContent)
+ | isElem ns "w" "r" element
+ , Just alternateContentElem <- findChildByName ns "mc" "AlternateContent" element
+ = unwrapAlternateContentElement ns alternateContentElem
| isElem ns "w" "smartTag" element
= concatMap (unwrapElement ns) (elChildren element)
+ | isElem ns "w" "p" element
+ , Just (modified, altContentRuns) <- extractChildren element (isAltContentRun ns)
+ = (unwrapElement ns modified) ++ concatMap (unwrapElement ns) altContentRuns
| otherwise
= [element{ elContent = concatMap (unwrapContent ns) (elContent element) }]
diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs
index 0a869bba8..f373c5330 100644
--- a/src/Text/Pandoc/Readers/Docx/Util.hs
+++ b/src/Text/Pandoc/Readers/Docx/Util.hs
@@ -21,12 +21,14 @@ module Text.Pandoc.Readers.Docx.Util (
, findChildrenByName
, findElementByName
, findAttrByName
+ , extractChildren
) where
import qualified Data.Text as T
import Data.Text (Text)
import Text.Pandoc.XML.Light
import qualified Data.Map as M
+import Data.List (partition)
type NameSpaces = M.Map Text Text
@@ -67,3 +69,23 @@ findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
findAttrByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findAttr (elemName ns' pref name) el
+
+
+-- | Removes child elements that satisfy a given condition.
+-- Returns the modified element and the list of removed children.
+extractChildren :: Element -> (Element -> Bool) -> Maybe (Element, [Element])
+extractChildren el condition
+ | null removedChildren = Nothing -- No children removed, return Nothing
+ | otherwise = Just (modifiedElement, removedChildren) -- Children removed, return Just
+ where
+ -- Separate the children based on the condition
+ (removedChildren, keptChildren) = partition condition (onlyElems' $ elContent el)
+
+ -- Helper function to filter only Element types from Content
+ onlyElems' :: [Content] -> [Element]
+ onlyElems' = foldr (\c acc -> case c of
+ Elem e -> e : acc
+ _ -> acc) []
+
+ -- Reconstruct the element with the kept children
+ modifiedElement = el { elContent = map Elem keptChildren } \ No newline at end of file
diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs
index d9935967f..05b69abf6 100644
--- a/test/Tests/Readers/Docx.hs
+++ b/test/Tests/Readers/Docx.hs
@@ -235,6 +235,10 @@ tests = [ testGroup "document"
"collapse overlapping targets (anchor spans)"
"docx/overlapping_targets.docx"
"docx/overlapping_targets.native"
+ , testCompare
+ "text in shape format"
+ "docx/text_in_shape_format.docx"
+ "docx/text_in_shape_format.native"
]
, testGroup "blocks"
[ testCompare
diff --git a/test/docx/text_in_shape_format.docx b/test/docx/text_in_shape_format.docx
new file mode 100644
index 000000000..763e62bf6
--- /dev/null
+++ b/test/docx/text_in_shape_format.docx
Binary files differ
diff --git a/test/docx/text_in_shape_format.native b/test/docx/text_in_shape_format.native
new file mode 100644
index 000000000..09cfa932c
--- /dev/null
+++ b/test/docx/text_in_shape_format.native
@@ -0,0 +1,33 @@
+[ Para
+ [ Str "Last"
+ , Space
+ , Str "update:"
+ , Space
+ , Str "May"
+ , Space
+ , Str "1,"
+ , Space
+ , Str "2017"
+ ]
+, Para
+ [ Str "U"
+ , Str "sing"
+ , Space
+ , Str "Microsoft"
+ , Space
+ , Str "Word"
+ , Space
+ , Str "2007/2010"
+ , LineBreak
+ , Str "for"
+ , Space
+ , Str "Writing"
+ , Space
+ , Str "Technical"
+ , Space
+ , Str "Documents"
+ ]
+, Para [ Str "Valter" , Space , Str "Kiisk" ]
+, Para [ Str "Institute" , Space , Str "of" , Space , Str "Physics," , Space , Str "University" , Space , Str "of" , Space , Str "Tartu" ]
+, Para []
+] \ No newline at end of file