summaryrefslogtreecommitdiff
path: root/private/ensmallen_entities_json.rb
diff options
context:
space:
mode:
authorCharlotte Koch <charlotte@magentastripe.com>2025-05-13 15:57:41 -0700
committerCharlotte Koch <charlotte@magentastripe.com>2025-05-13 15:57:41 -0700
commit09749920569e7426e028cc284404d076526b4bb1 (patch)
tree9bc679e6c041a4ae49722548e7d4b2daf3971ee8 /private/ensmallen_entities_json.rb
parentf5c65e7566b9ee4fb15e86d673ee684a3bee407c (diff)
WIL-6 WIP better unicode substitutionsbetter_unicode
Diffstat (limited to 'private/ensmallen_entities_json.rb')
-rw-r--r--private/ensmallen_entities_json.rb33
1 files changed, 33 insertions, 0 deletions
diff --git a/private/ensmallen_entities_json.rb b/private/ensmallen_entities_json.rb
new file mode 100644
index 0000000..8367b6a
--- /dev/null
+++ b/private/ensmallen_entities_json.rb
@@ -0,0 +1,33 @@
+#
+# The entities.json file from WHATWG contains a bunch of duplicates. In
+# particular, I don't care about entities that aren't terminated with a
+# semicolon -- i.e., Willora users MUST terminate HTML codes with a
+# semicolon.
+#
+# This script massages entities.json into a smaller and (in my opinion)
+# equivalent file.
+#
+# References:
+#
+# - WHATWG's table of named chars
+# https://html.spec.whatwg.org/multipage/named-characters.html
+#
+# - URL for the entities.json itself
+# https://html.spec.whatwg.org/entities.json
+#
+
+require 'json'
+
+# Returns true when `str` ends with a semicolon (e.g. "&amp;") and false
+# otherwise, including for the empty string.
+def semicoloned?(str)
+  str.end_with?(";")
+end
+
+# Usage: ruby ensmallen_entities_json.rb path/to/entities.json
+#
+# Reads the JSON file named by the first command-line argument, keeps only
+# the entries whose key ends with a semicolon, drops the "characters"
+# member from each remaining value, and prints the result to standard
+# output as one JSON document.
+arg = ARGV.shift
+
+# Without this guard a missing argument only shows up as a TypeError raised
+# by File.read(nil), which says nothing about how to invoke the script.
+abort "usage: #{$PROGRAM_NAME} path/to/entities.json" unless arg
+
+# The input is plain data, so use the plain parser rather than JSON.load.
+entities = JSON.parse(File.read(arg))
+
+# Build new value hashes instead of deleting from the parsed ones, so the
+# transformation has no side effects on `entities`.
+new_entities = entities.
+  select { |name, _| semicoloned?(name) }.
+  map { |name, info| [name, info.reject { |field, _| field == "characters" }] }.
+  to_h
+
+puts JSON.generate(new_entities)