From 31a6527fe830734191a7fc9ff78b6c0a5140688f Mon Sep 17 00:00:00 2001
From: Charlotte Koch
Date: Sun, 6 Apr 2025 13:04:30 -0700
Subject: More nuanced way of handling ellipses

---
Review notes (commentary only; not part of the commit message or the diff):

* The comment-stripping rule `s,//.*,,g` now runs first. It is not anchored
  to the start of the line, so it deletes everything from the first `//`
  anywhere on a line (for example the tail of a `scheme://host` URL), not
  only whole-line comments. `s,^//.*,,` would limit it to lines that begin
  with `//` -- confirm against the manuscripts before changing.
* The ellipsis rule always emits one space after `&#8230;`, so an ellipsis at
  end of line leaves trailing whitespace, and `...?` becomes `&#8230; ?`.
  Only the space before `&#8221;` is cleaned up afterwards.
* The final `s,\ \&\#8221;,\&\#8221;,g` can never match: the preceding
  `[[:space:]]*` rule has already removed every whitespace run that sits
  directly before `&#8221;`. It is harmless but redundant.

 script/unicodify.sed | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/script/unicodify.sed b/script/unicodify.sed
index 4668732..138003e 100644
--- a/script/unicodify.sed
+++ b/script/unicodify.sed
@@ -1,6 +1,12 @@
-# Remove spaces on either end of an em-dash or ellipsis.
-s,[[:space:]]--[[:space:]],\&\#8212;,g
-s,[[:space:]]\.\.\.[[:space:]],\&\#8230;,g
+# Remove Asciidoc comments now, for the sake of getting more accurate
+# wordcounts.
+s,//.*,,g
+
+# Remove spaces on either end of an em-dash.
+s,[[:space:]]*--[[:space:]]*,\&\#8212;,g
+
+# Remove spaces before an ellipsis, while ensuring one space after.
+s,[[:space:]]*\.\.\.[[:space:]]*,\&\#8230;\ ,g
 
 # Explicitly handle curly double quotes before curly single quotes.
 s,"`,\&\#8220;,g
@@ -14,6 +20,7 @@ s,\è,\&\#232;,g
 s,\é,\&\#233;,g
 s,\ï,\&\#239;,g
 
-# Remove Asciidoc comments now, for the sake of getting more accurate
-# wordcounts.
-s,//.*,,g
+# Remove spaces before a close-quote, which might have accidentally been
+# introduced while converting ellipses earlier.
+s,[[:space:]]*\&\#8221;,\&\#8221;,g
+s,\ \&\#8221;,\&\#8221;,g
-- 
cgit v1.2.3