From 09749920569e7426e028cc284404d076526b4bb1 Mon Sep 17 00:00:00 2001
From: Charlotte Koch <charlotte@magentastripe.com>
Date: Tue, 13 May 2025 15:57:41 -0700
Subject: WIL-6 WIP better unicode substitutions

---
 private/ensmallen_entities_json.rb | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 private/ensmallen_entities_json.rb

(limited to 'private/ensmallen_entities_json.rb')

diff --git a/private/ensmallen_entities_json.rb b/private/ensmallen_entities_json.rb
new file mode 100644
index 0000000..8367b6a
--- /dev/null
+++ b/private/ensmallen_entities_json.rb
@@ -0,0 +1,33 @@
+#
+# The entities.json file from WHATWG contains a bunch of duplicates. In
+# particular, I don't care about entities that aren't terminated with a
+# semicolon -- i.e., Willora users MUST terminate HTML codes with a
+# semicolon.
+#
+# This script massages entities.json into a smaller and (in my opinion)
+# equivalent file.
+#
+# References:
+#
+# - WHATWG's table of named chars
+#     https://html.spec.whatwg.org/multipage/named-characters.html
+#
+# - URL for the entities.json itself
+#     https://html.spec.whatwg.org/entities.json
+#
+
+require 'json'
+
+def semicoloned?(str)
+  return str[-1] == ";"
+end
+
+arg = ARGV.shift
+
+entities = JSON.load(File.read(arg))
+
+new_entities = entities.
+  select { |k, v| semicoloned?(k) }.
+  map { |k, v| v.delete("characters"); [k, v] }
+
+puts JSON.generate(Hash[new_entities])
-- 
cgit v1.2.3