diff options
author | Victor Stinner <vstinner@python.org> | 2022-10-17 12:01:00 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-17 12:01:00 +0200 |
commit | 1863302d61a7a5dd8b8d345a00f0ee242c7c10bf (patch) | |
tree | a1e41af02147e2a14155d5b19d7b68bbb31c3f6f /Tools/build/parse_html5_entities.py | |
parent | eae7dad40255bad42e4abce53ff8143dcbc66af5 (diff) | |
download | cpython-1863302d61a7a5dd8b8d345a00f0ee242c7c10bf.tar.gz cpython-1863302d61a7a5dd8b8d345a00f0ee242c7c10bf.zip |
gh-97669: Create Tools/build/ directory (#97963)
Create Tools/build/ directory. Move the following scripts from
Tools/scripts/ to Tools/build/:
* check_extension_modules.py
* deepfreeze.py
* freeze_modules.py
* generate_global_objects.py
* generate_levenshtein_examples.py
* generate_opcode_h.py
* generate_re_casefix.py
* generate_sre_constants.py
* generate_stdlib_module_names.py
* generate_token.py
* parse_html5_entities.py
* smelly.py
* stable_abi.py
* umarshal.py
* update_file.py
* verify_ensurepip_wheels.py
Update references to these scripts.
Diffstat (limited to 'Tools/build/parse_html5_entities.py')
-rwxr-xr-x | Tools/build/parse_html5_entities.py | 115 |
1 files changed, 115 insertions, 0 deletions
diff --git a/Tools/build/parse_html5_entities.py b/Tools/build/parse_html5_entities.py new file mode 100755 index 00000000000..d2bf2909103 --- /dev/null +++ b/Tools/build/parse_html5_entities.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +""" +Utility for parsing HTML5 entity definitions available from: + + https://html.spec.whatwg.org/entities.json + https://html.spec.whatwg.org/multipage/named-characters.html + +The page now contains the following note: + + "This list is static and will not be expanded or changed in the future." + +Written by Ezio Melotti and Iuliia Proskurnia. +""" + +import os +import sys +import json +from urllib.request import urlopen +from html.entities import html5 + +SCRIPT_NAME = 'Tools/build/parse_html5_entities.py' +PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html' +ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json' +HTML5_SECTION_START = '# HTML5 named character references' + +def get_json(url): + """Download the json file from the url and returns a decoded object.""" + with urlopen(url) as f: + data = f.read().decode('utf-8') + return json.loads(data) + +def create_dict(entities): + """Create the html5 dict from the decoded json object.""" + new_html5 = {} + for name, value in entities.items(): + new_html5[name.lstrip('&')] = value['characters'] + return new_html5 + +def compare_dicts(old, new): + """Compare the old and new dicts and print the differences.""" + added = new.keys() - old.keys() + if added: + print('{} entitie(s) have been added:'.format(len(added))) + for name in sorted(added): + print(' {!r}: {!r}'.format(name, new[name])) + removed = old.keys() - new.keys() + if removed: + print('{} entitie(s) have been removed:'.format(len(removed))) + for name in sorted(removed): + print(' {!r}: {!r}'.format(name, old[name])) + changed = set() + for name in (old.keys() & new.keys()): + if old[name] != new[name]: + changed.add((name, old[name], new[name])) + if changed: + print('{} entitie(s) have been modified:'.format(len(changed))) + for item in sorted(changed): + print(' {!r}: {!r} -> {!r}'.format(*item)) + +def write_items(entities, file=sys.stdout): + """Write the items of the dictionary in the specified file.""" + # The keys in the generated dictionary should be sorted + # in a case-insensitive way, however, when two keys are equal, + # the uppercase version should come first so that the result + # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...] + # To do this we first sort in a case-sensitive way (so all the + # uppercase chars come first) and then sort with key=str.lower. + # Since the sorting is stable the uppercase keys will eventually + # be before their equivalent lowercase version. + keys = sorted(entities.keys()) + keys = sorted(keys, key=str.lower) + print(HTML5_SECTION_START, file=file) + print(f'# Generated by {SCRIPT_NAME}\n' + f'# from {ENTITIES_URL} and\n' + f'# {PAGE_URL}.\n' + f'# Map HTML5 named character references to the ' + f'equivalent Unicode character(s).', file=file) + print('html5 = {', file=file) + for name in keys: + print(f' {name!r}: {entities[name]!a},', file=file) + print('}', file=file) + + +if __name__ == '__main__': + # without args print a diff between html.entities.html5 and new_html5 + # with --create print the new html5 dict + # with --patch patch the Lib/html/entities.py file + new_html5 = create_dict(get_json(ENTITIES_URL)) + if '--create' in sys.argv: + write_items(new_html5) + elif '--patch' in sys.argv: + fname = 'Lib/html/entities.py' + temp_fname = fname + '.temp' + with open(fname) as f1, open(temp_fname, 'w') as f2: + skip = False + for line in f1: + if line.startswith(HTML5_SECTION_START): + write_items(new_html5, file=f2) + skip = True + continue + if skip: + # skip the old items until the } + if line.startswith('}'): + skip = False + continue + f2.write(line) + os.remove(fname) + os.rename(temp_fname, fname) + else: + if html5 == new_html5: + print('The current dictionary is updated.') + else: + compare_dicts(html5, new_html5) + print('Run "./python {0} --patch" to update Lib/html/entities.html ' + 'or "./python {0} --create" to see the generated ' 'dictionary.'.format(__file__)) |