diff options
Diffstat (limited to 'tests/bytecode/mp-tests/ptex.py')
-rw-r--r-- | tests/bytecode/mp-tests/ptex.py | 269 |
1 file changed, 0 insertions, 269 deletions
import sys
import os
import os.path
import datetime
import argparse
from xml.etree.ElementTree import Element, SubElement, tostring

from log import Log
from texparser import TexParser
from latexparser import LatexParser
from gettexfile import file_has_suffix
from gettexfile import get_tex_file

from xiwi.common.misc import buildFileList
from xiwi.common import arxivid
from xiwi.common.stats import Statistics


def str_contains(s1, s2):
    """Return True if s2 occurs anywhere within s1."""
    return s1.find(s2) != -1


def str_contains_one_of(st, st_list):
    """Return True if st contains at least one of the strings in st_list."""
    for st2 in st_list:
        if str_contains(st, st2):
            return True
    return False


def detect_file_kind(file_obj):
    """Simple detection of kind of source file.

    Returns one of 'PS', 'auto-ignore', 'tex', 'latex' or 'unknown'.
    The file position is rewound to the start before returning.
    """
    kind = 'unknown'
    # skip leading blank lines to find the first meaningful line
    firstline = file_obj.readline()
    while firstline.isspace():
        firstline = file_obj.readline()
    if firstline.startswith('%!PS'):
        kind = 'PS'
    elif firstline.startswith('%auto-ignore'):
        kind = 'auto-ignore'
    else:
        file_obj.seek(0)
        for line in file_obj:
            if str_contains(line, '\\def'):
                # might be tex, if we don't find anything else
                kind = 'tex'
            if str_contains(line, '\\input'):
                # might be tex, if we don't find anything else
                kind = 'tex'
            if str_contains(line, 'amstex') or str_contains(line, 'harvmac'):
                # definitely tex
                kind = 'tex'
                break
            if str_contains(line, '\\documentclass'):
                # definitely latex
                kind = 'latex'
                break
            if str_contains(line, '\\documentstyle'):
                # could be tex or latex
                if str_contains(line, 'amsppt'):
                    kind = 'tex'
                    break
                else:
                    kind = 'latex'
                    break
    file_obj.seek(0)
    return kind


class WithdrawnPaper(object):
    """Stand-in parser for withdrawn papers: no references, always 'succeeds'."""

    def __init__(self):
        pass

    def __getitem__(self, item):
        if item == 'refs':
            return []
        elif item == 'success':
            return True

    def parse(self):
        # nothing to parse for a withdrawn paper
        pass


def process_article(filename):
    """Returns TexParserBase derived object on success, None on failure."""

    # get the tex file
    filename, file_obj, tarfile_obj = get_tex_file(filename)
    if file_obj is None:
        return None

    # detect the type of file
    kind = detect_file_kind(file_obj)

    # act on the type of file
    parser = None
    if kind == 'PS':
        print('skipping postscript file')
    elif kind == 'auto-ignore':
        print('asked to ignore file, most likely it was withdrawn')
        parser = WithdrawnPaper()
    # BUG FIX: this was a bare 'if', which broke the chain so the trailing
    # 'else' fired (printing "cannot determine kind of file") even for
    # 'PS' and 'auto-ignore' files.
    elif kind == 'tex':
        print('parsing as TeX')
        parser = TexParser(filename, file_obj, tarfile_obj)
    elif kind == 'latex':
        print('parsing as LaTeX')
        parser = LatexParser(filename, file_obj, tarfile_obj)
    else:
        print('cannot determine kind of file')

    # attempt to parse the document; a failed parse yields None
    try:
        if parser is not None:
            parser.parse()
    except Exception as e:
        print('exception while trying to parse file:')
        print(str(e))
        parser = None

    # close the files
    file_obj.close()
    if tarfile_obj is not None:
        tarfile_obj.close()

    # return the parsed document
    return parser


arxiv_classes = [
    'acc-phys', 'adap-org', 'alg-geom', 'ao-sci', 'astro-ph', 'atom-ph',
    'bayes-an', 'chao-dyn', 'chem-ph', 'cmp-lg', 'comp-gas', 'cond-mat',
    'cs', 'dg-ga', 'funct-an', 'gr-qc', 'hep-ex', 'hep-lat',
    'hep-ph', 'hep-th', 'math', 'math-ph', 'mtrl-th', 'nlin',
    'nucl-ex', 'nucl-th', 'patt-sol', 'physics', 'plasm-ph', 'q-alg',
    'q-bio', 'quant-ph', 'solv-int', 'supr-con'
]


def do_single_file(file_name, print_xml, write_xml_dir):
    """Process one source file: parse it, record statistics, optionally
    emit per-article XML (to stdout and/or write_xml_dir).

    Returns True on success, False on failure.
    """
    arxiv_id, arxiv_version = arxivid.filenameToArxivAndVersion(file_name)
    if arxiv_id is None:
        print('WARN: could not determine arXiv identifier for', file_name)
        arxiv_id = '<unknown>'
        arxiv_version = 0

    Log.reset()
    Statistics.begin_item(arxiv_id)

    if file_has_suffix(file_name, '.pdf'):
        Statistics.count('1) pdf')
        succ = True
        # BUG FIX: bib_refs was left unassigned on this path, raising
        # NameError below when XML output was requested for a PDF.
        bib_refs = []
    else:
        Statistics.count('2) processed')

        parser = process_article(file_name)

        if parser is not None:
            succ = parser['success']
            bib_refs = parser['refs']
        else:
            succ = False
            bib_refs = []

    if str_contains_one_of(arxiv_id, ['gr-qc', 'hep-']):
        Statistics.count('hep-processed')
        if succ:
            Statistics.count('hep-success')
    if succ:
        print('-success--------')
        Statistics.count('3) success')
    else:
        print('-fail-----------')
        Statistics.count('4) fail')

    # debugging aid: dump each reference when enabled
    show_ref = False

    if succ and show_ref:
        for bib_ref in bib_refs:
            print(bib_ref.key, 'with', bib_ref.cite_count, 'citations in paper')
            if len(bib_ref.bib_info) == 0:
                print('no reference')
            else:
                print(bib_ref.bib_info_as_str(keep_comments=True))

    if succ and (print_xml or write_xml_dir):
        xml = Element('article')
        SubElement(xml, 'id').text = arxiv_id
        if arxiv_version > 0:
            SubElement(xml, 'version').text = str(arxiv_version)
        refs = SubElement(xml, 'refs')
        for bib_ref in bib_refs:
            bib_text = bib_ref.bib_info_as_str(keep_comments=True)
            if len(bib_text) != 0:
                # every emitted reference counts as cited at least once
                ncites = bib_ref.cite_count
                if ncites < 1:
                    ncites = 1
                ref = SubElement(refs, 'ref', order=str(bib_ref.ref_order_num), freq=str(ncites))
                ref.text = bib_text
        if print_xml:
            print(tostring(xml))
        if isinstance(write_xml_dir, str):
            if arxiv_id != '<unknown>':
                xml_file_name = os.path.join(write_xml_dir, arxiv_id.replace('/', '') + '.xml')
            else:
                # no arXiv id: fall back to the input file's base name
                fname = os.path.split(file_name)[1]
                if fname.rfind('.') > 0:
                    fname = fname[:fname.rfind('.')]
                xml_file_name = write_xml_dir + '/' + fname + '.xml'
            # use a context manager so the file is closed even on error
            with open(xml_file_name, 'wb') as file_obj:
                file_obj.write(tostring(xml, encoding='utf-8'))

    Statistics.end_item()

    return succ


# summary lines accumulated for the end-of-run email report
summaryStrs = []

if __name__ == "__main__":
    cmd_parser = argparse.ArgumentParser(description='Parse TeX/LaTeX to find references.')
    cmd_parser.add_argument('--filelist', action='store_true', help='file names on the command line each contain a list of files to process')
    cmd_parser.add_argument('--print-xml', action='store_true', help='print XML output to stdout')
    cmd_parser.add_argument('--write-xml', metavar='<dir>', help='destination directory to write XML output files')
    cmd_parser.add_argument('--failed', metavar='<file>', help='output file to write list of failed files')
    cmd_parser.add_argument('files', nargs='+', help='input files')
    args = cmd_parser.parse_args()

    # print date stamp
    timeStart = datetime.datetime.now()
    print('[ptex] started processing at', str(timeStart))

    print('given', len(args.files), 'files, first file:', args.files[0])
    print('================')

    Statistics.clear('article')

    # build list of files to process
    file_list = buildFileList(args.filelist, args.files)

    # ensure the destination directory exists
    # BUG FIX: the original condition was inverted (it only called makedirs
    # when the directory already existed), so a missing directory was never
    # created.
    if args.write_xml is not None and not os.path.exists(args.write_xml):
        try:
            os.makedirs(args.write_xml)
        except OSError:
            # a concurrent run may have created it; writing will fail later
            # if the directory is genuinely unavailable
            pass

    # process the files
    failed_files = []
    for file_name in file_list:
        success = do_single_file(file_name, args.print_xml, args.write_xml)
        if not success:
            failed_files.append(file_name)

    # write the failed files to an output file, if requested
    if args.failed is not None:
        with open(args.failed, 'w') as file_obj:
            file_obj.writelines(f + '\n' for f in failed_files)

    print('================')
    Statistics.show()
    Statistics.show_detail('fail')
    #Statistics.show_detail('cite-range')
    #Statistics.show_detail('bad-ascii')
    #Statistics.show_detail('non-ascii')

    print('================')

    # print date stamp
    timeEnd = datetime.datetime.now()
    print('[ptex] finished processing at', str(timeEnd))

    # print summary for email
    summaryStrs.extend(Statistics.get_summary())
    summaryStrs.insert(0, 'started processing at %s, took %.1f minutes' % (timeStart.strftime('%H:%M'), (timeEnd - timeStart).total_seconds() / 60))
    for s in summaryStrs:
        print('**SUMMARY** [ptex]', s)