diff options
| author | Charlotte Koch <charlotte@magentastripe.com> | 2025-05-13 15:57:41 -0700 |
|---|---|---|
| committer | Charlotte Koch <charlotte@magentastripe.com> | 2025-05-13 15:57:41 -0700 |
| commit | 09749920569e7426e028cc284404d076526b4bb1 (patch) | |
| tree | 9bc679e6c041a4ae49722548e7d4b2daf3971ee8 /private/ensmallen_entities_json.rb | |
| parent | f5c65e7566b9ee4fb15e86d673ee684a3bee407c (diff) | |
WIL-6 WIP better unicode substitutions [better_unicode]
Diffstat (limited to 'private/ensmallen_entities_json.rb')
| -rw-r--r-- | private/ensmallen_entities_json.rb | 33 |
1 files changed, 33 insertions, 0 deletions
#
# File: private/ensmallen_entities_json.rb
#
# The entities.json file from WHATWG contains a bunch of duplicates. In
# particular, I don't care about entities that aren't terminated with a
# semicolon -- i.e., Willora users MUST terminate HTML codes with a
# semicolon.
#
# This script massages entities.json into a smaller and (in my opinion)
# equivalent file: only semicolon-terminated entity names are kept, and
# the "characters" member of every entry is dropped.
#
# Usage:
#
#   ruby ensmallen_entities_json.rb entities.json > smaller.json
#
# References:
#
# - WHATWG's table of named chars
#   https://html.spec.whatwg.org/multipage/named-characters.html
#
# - URL for the entities.json itself
#   https://html.spec.whatwg.org/entities.json
#

require 'json'

# Returns true when +str+ ends with a semicolon, false otherwise
# (including for the empty string).
def semicoloned?(str)
  str.end_with?(";")
end

# Builds the reduced entity table.
#
# +entities+ is a Hash mapping entity names to Hashes of entity details,
# as produced by parsing entities.json.
#
# Returns a new Hash that holds only the entries whose name is
# semicolon-terminated, each without its "characters" key. Neither
# +entities+ nor its values are modified.
def ensmallen_entities(entities)
  entities
    .select { |name, _| semicoloned?(name) }
    .transform_values { |details| details.reject { |key, _| key == "characters" } }
end

# Script entry point.
#
# Reads the JSON file named by the first element of +argv+ and prints the
# reduced table to standard output as one line of JSON. When no file name
# is given, a usage message is written to standard error and nothing is
# printed to standard output.
def main(argv)
  path = argv.first
  unless path
    warn "usage: #{File.basename($PROGRAM_NAME)} ENTITIES_JSON"
    return
  end

  # Read explicitly as UTF-8 so the result does not depend on the locale.
  # JSON.parse (rather than JSON.load) because this is plain data parsing.
  entities = JSON.parse(File.read(path, encoding: "UTF-8"))
  puts JSON.generate(ensmallen_entities(entities))
end

main(ARGV)
