WIL-6 WIP better unicode substitutions [better_unicode]

author: Charlotte Koch <charlotte@magentastripe.com> 2025-05-13 15:57:41 -0700
committer: Charlotte Koch <charlotte@magentastripe.com> 2025-05-13 15:57:41 -0700
commit: 09749920569e7426e028cc284404d076526b4bb1 (patch)
tree: 9bc679e6c041a4ae49722548e7d4b2daf3971ee8 /script/unicodify.rb
parent: f5c65e7566b9ee4fb15e86d673ee684a3bee407c (diff)
1 files changed, 23 insertions, 0 deletions
diff --git a/script/unicodify.rb b/script/unicodify.rb
new file mode 100644
index 0000000..aac8cbd
--- /dev/null
+++ b/script/unicodify.rb
@@ -0,0 +1,23 @@
+#
+# unicodify.rb
+# Charlotte Koch <charlotte@magentastripe.com>
+#
+# This file is part of Willora.
+#
+# This script rewrites named HTML entities (e.g. "&amp;") on the standard input
+# as decimal numeric character references ("&#38;") on the standard output.
+# It reads the whole input into memory in order to keep it fast.
+#
+
+require 'json'
+
+# Map each entity name to the numeric character references for its codepoints.
+table = JSON.parse(File.read("./private/entities.min.json"))
+table.transform_values! { |v| v["codepoints"].map { |n| format('&#%d;', n) }.join }
+
+# Longest names first: if one key is a prefix of another (say "&amp" and
+# "&amp;"), the alternation must try the longer one before the shorter.
+pattern = Regexp.union(table.keys.sort_by { |name| -name.length })
+
+# A single pass over the input; String#gsub looks each match up in the hash.
+$stdout.write($stdin.read.gsub(pattern, table))
author	Charlotte Koch <charlotte@magentastripe.com>	2025-05-13 15:57:41 -0700
committer	Charlotte Koch <charlotte@magentastripe.com>	2025-05-13 15:57:41 -0700
commit	09749920569e7426e028cc284404d076526b4bb1 (patch)
tree	9bc679e6c041a4ae49722548e7d4b2daf3971ee8 /script/unicodify.rb
parent	f5c65e7566b9ee4fb15e86d673ee684a3bee407c (diff)