aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/Tools/build/parse_html5_entities.py
diff options
context:
space:
mode:
authorVictor Stinner <vstinner@python.org>2022-10-17 12:01:00 +0200
committerGitHub <noreply@github.com>2022-10-17 12:01:00 +0200
commit1863302d61a7a5dd8b8d345a00f0ee242c7c10bf (patch)
treea1e41af02147e2a14155d5b19d7b68bbb31c3f6f /Tools/build/parse_html5_entities.py
parenteae7dad40255bad42e4abce53ff8143dcbc66af5 (diff)
downloadcpython-1863302d61a7a5dd8b8d345a00f0ee242c7c10bf.tar.gz
cpython-1863302d61a7a5dd8b8d345a00f0ee242c7c10bf.zip
gh-97669: Create Tools/build/ directory (#97963)
Create Tools/build/ directory. Move the following scripts from Tools/scripts/ to Tools/build/:

* check_extension_modules.py
* deepfreeze.py
* freeze_modules.py
* generate_global_objects.py
* generate_levenshtein_examples.py
* generate_opcode_h.py
* generate_re_casefix.py
* generate_sre_constants.py
* generate_stdlib_module_names.py
* generate_token.py
* parse_html5_entities.py
* smelly.py
* stable_abi.py
* umarshal.py
* update_file.py
* verify_ensurepip_wheels.py

Update references to these scripts.
Diffstat (limited to 'Tools/build/parse_html5_entities.py')
-rwxr-xr-xTools/build/parse_html5_entities.py115
1 files changed, 115 insertions, 0 deletions
diff --git a/Tools/build/parse_html5_entities.py b/Tools/build/parse_html5_entities.py
new file mode 100755
index 00000000000..d2bf2909103
--- /dev/null
+++ b/Tools/build/parse_html5_entities.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+"""
+Utility for parsing HTML5 entity definitions available from:
+
+ https://html.spec.whatwg.org/entities.json
+ https://html.spec.whatwg.org/multipage/named-characters.html
+
+The page now contains the following note:
+
+ "This list is static and will not be expanded or changed in the future."
+
+Written by Ezio Melotti and Iuliia Proskurnia.
+"""
+
+import os
+import sys
+import json
+from urllib.request import urlopen
+from html.entities import html5
+
# Path of this script; write_items() embeds it in the "Generated by" header.
SCRIPT_NAME = 'Tools/build/parse_html5_entities.py'
# Human-readable page listing the entities; only cited in the generated header.
PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html'
# JSON document that is downloaded and parsed to build the html5 dict.
ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json'
# First line written by write_items(); --patch searches for it to locate the
# section of Lib/html/entities.py that has to be replaced.
HTML5_SECTION_START = '# HTML5 named character references'
+
def get_json(url):
    """Fetch *url* and return its content parsed as JSON.

    The response body is read in full, decoded as UTF-8 and passed to
    the JSON parser; the decoded Python object is returned.
    """
    with urlopen(url) as response:
        payload = response.read()
    text = payload.decode('utf-8')
    return json.loads(text)
+
def create_dict(entities):
    """Build the html5 mapping from the decoded entities.json object.

    Every key of *entities* has its leading '&' characters stripped and
    is mapped to the 'characters' string of the corresponding entry.
    """
    return {
        reference.lstrip('&'): info['characters']
        for reference, info in entities.items()
    }
+
def compare_dicts(old, new):
    """Print how the *new* dict differs from the *old* one.

    Three sections are printed, each only when it is non-empty and each
    sorted by entity name: names only present in *new* (added), names
    only present in *old* (removed), and names present in both whose
    values differ (modified).
    """
    old_names = old.keys()
    new_names = new.keys()

    only_new = new_names - old_names
    if only_new:
        print(f'{len(only_new)} entitie(s) have been added:')
        for key in sorted(only_new):
            print(f' {key!r}: {new[key]!r}')

    only_old = old_names - new_names
    if only_old:
        print(f'{len(only_old)} entitie(s) have been removed:')
        for key in sorted(only_old):
            print(f' {key!r}: {old[key]!r}')

    # Names are unique, so sorting the triples orders them by name alone.
    modified = sorted(
        (key, old[key], new[key])
        for key in old_names & new_names
        if old[key] != new[key]
    )
    if modified:
        print(f'{len(modified)} entitie(s) have been modified:')
        for key, before, after in modified:
            print(f' {key!r}: {before!r} -> {after!r}')
+
def write_items(entities, file=None):
    """Write the html5 section (header comments plus dict literal) to *file*.

    entities: mapping of entity name to its replacement character(s).
    file: writable text stream.  None (the default) means the current
        sys.stdout, looked up at call time by print(), so a redirected
        sys.stdout is honoured.
    """
    # The keys in the generated dictionary should be sorted
    # in a case-insensitive way, however, when two keys are equal,
    # the uppercase version should come first so that the result
    # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # Sorting on (lowercased name, name) does exactly that: the
    # case-sensitive name only breaks ties between names that differ
    # in case, and uppercase chars order before lowercase ones.
    keys = sorted(entities, key=lambda name: (name.lower(), name))
    print(HTML5_SECTION_START, file=file)
    print(f'# Generated by {SCRIPT_NAME}\n'
          f'# from {ENTITIES_URL} and\n'
          f'# {PAGE_URL}.\n'
          f'# Map HTML5 named character references to the '
          f'equivalent Unicode character(s).', file=file)
    print('html5 = {', file=file)
    for name in keys:
        # !a keeps the generated source pure ASCII.
        print(f' {name!r}: {entities[name]!a},', file=file)
    print('}', file=file)
+
+
if __name__ == '__main__':
    # Without arguments, print a diff between html.entities.html5 and the
    # freshly downloaded mapping.
    # With --create, print the new html5 dict.
    # With --patch, rewrite the html5 section of Lib/html/entities.py.
    new_html5 = create_dict(get_json(ENTITIES_URL))
    if '--create' in sys.argv:
        write_items(new_html5)
    elif '--patch' in sys.argv:
        fname = 'Lib/html/entities.py'
        temp_fname = fname + '.temp'
        # Python source files are UTF-8 by default, so do not depend on
        # the locale encoding when copying the module.
        with open(fname, encoding='utf-8') as f1, \
             open(temp_fname, 'w', encoding='utf-8') as f2:
            skip = False
            for line in f1:
                if line.startswith(HTML5_SECTION_START):
                    # Emit the regenerated section in place of the old one.
                    write_items(new_html5, file=f2)
                    skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                f2.write(line)
        # Replace the target in a single step, so there is never a moment
        # where Lib/html/entities.py does not exist.
        os.replace(temp_fname, fname)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))