# Provenance: private/ensmallen_entities_json.rb as added by commit
# 09749920569e7426e028cc284404d076526b4bb1
# ("WIL-6 WIP better unicode substitutions", Charlotte Koch,
# Tue, 13 May 2025 15:57:41 -0700).

#
# The entities.json file from WHATWG contains a bunch of duplicates. In
# particular, I don't care about entities that aren't terminated with a
# semicolon -- i.e., Willora users MUST terminate HTML codes with a
# semicolon.
#
# This script massages entities.json into a smaller and (in my opinion)
# equivalent file.
#
# Usage:
#
#   ruby ensmallen_entities_json.rb path/to/entities.json > smaller.json
#
# References:
#
# - WHATWG's table of named chars
#   https://html.spec.whatwg.org/multipage/named-characters.html
#
# - URL for the entities.json itself
#   https://html.spec.whatwg.org/entities.json
#

require 'json'

# Tells whether an entity name is written in its semicolon-terminated form.
#
# @param str [String] entity name as it appears as a key in entities.json
# @return [Boolean] true when +str+ ends with ";", false otherwise
#   (including for the empty string)
def semicoloned?(str)
  str.end_with?(";")
end

# Builds the reduced entity table: only semicolon-terminated names are kept,
# and the "characters" field is dropped from each kept entry. Entry order
# follows the input, and the input hash and its entries are left unmodified.
#
# @param entities [Hash{String => Hash}] parsed contents of entities.json
# @return [Hash{String => Hash}] a new, smaller table
def ensmallen_entities(entities)
  entities.each_with_object({}) do |(name, info), kept|
    next unless semicoloned?(name)

    # reject returns a copy, so the caller's parsed data is not altered.
    kept[name] = info.reject { |field, _value| field == "characters" }
  end
end

# Command-line entry point: reads the JSON file named by the first argument
# and prints the reduced table to standard output as a single JSON document.
# When no argument is given, a usage line goes to standard error and nothing
# else happens.
#
# @param argv [Array<String>] command-line arguments
# @return [void]
# @raise [JSON::ParserError] when the file does not contain valid JSON
# @raise [SystemCallError] when the file cannot be read
def ensmallen_main(argv)
  path = argv.first
  if path.nil?
    warn "usage: #{File.basename($PROGRAM_NAME)} path/to/entities.json"
    return
  end

  # JSON.parse rather than JSON.load: the input is a file fetched from the
  # web, and parse is the strict loader (a blank file is reported as a
  # parse error instead of silently becoming nil).
  entities = JSON.parse(File.read(path))
  puts JSON.generate(ensmallen_entities(entities))
end

ensmallen_main(ARGV) if __FILE__ == $PROGRAM_NAME