diff options
Diffstat (limited to 'tests/bytecode/mp-tests/ptex.py')
-rw-r--r-- | tests/bytecode/mp-tests/ptex.py | 269 |
1 files changed, 269 insertions, 0 deletions
diff --git a/tests/bytecode/mp-tests/ptex.py b/tests/bytecode/mp-tests/ptex.py new file mode 100644 index 0000000000..8f23d78009 --- /dev/null +++ b/tests/bytecode/mp-tests/ptex.py @@ -0,0 +1,269 @@ +import sys +import os +import os.path +import datetime +import argparse +from xml.etree.ElementTree import Element, SubElement, tostring + +from log import Log +from texparser import TexParser +from latexparser import LatexParser +from gettexfile import file_has_suffix +from gettexfile import get_tex_file + +from xiwi.common.misc import buildFileList +from xiwi.common import arxivid +from xiwi.common.stats import Statistics + +def str_contains(s1, s2): + return s1.find(s2) != -1 + +def str_contains_one_of(st, st_list): + for st2 in st_list: + if str_contains(st, st2): + return True + return False + +def detect_file_kind(file_obj): + """Simple detection of kind of source file.""" + kind = 'unknown' + firstline = file_obj.readline() + while firstline.isspace(): + firstline = file_obj.readline() + if firstline.startswith('%!PS'): + kind = 'PS' + elif firstline.startswith('%auto-ignore'): + kind = 'auto-ignore' + else: + file_obj.seek(0) + for line in file_obj: + if str_contains(line, '\\def'): + # might be tex, if we don't find anything else + kind = 'tex' + if str_contains(line, '\\input'): + # might be tex, if we don't find anything else + kind = 'tex' + if str_contains(line, 'amstex') or str_contains(line, 'harvmac'): + # definitely tex + kind = 'tex' + break + if str_contains(line, '\\documentclass'): + # definitely latex + kind = 'latex' + break + if str_contains(line, '\\documentstyle'): + # could be tex or latex + if str_contains(line, 'amsppt'): + kind = 'tex' + break + else: + kind = 'latex' + break + file_obj.seek(0) + return kind + +class WithdrawnPaper(object): + def __init__(self): + pass + + def __getitem__(self, item): + if item == 'refs': + return [] + elif item == 'success': + return True + + def parse(self): + pass + +def process_article(filename): + """Returns TexParserBase derived object on success, None on failure.""" + + # get the tex file + filename, file_obj, tarfile_obj = get_tex_file(filename) + if file_obj is None: + return None + + # detect the type of file + kind = detect_file_kind(file_obj) + + # act on the type of file + parser = None + if kind == 'PS': + print('skipping postscript file') + elif kind == 'auto-ignore': + print('asked to ignore file, most likely it was withdrawn') + parser = WithdrawnPaper() + if kind == 'tex': + print('parsing as TeX') + parser = TexParser(filename, file_obj, tarfile_obj) + elif kind == 'latex': + print('parsing as LaTeX') + parser = LatexParser(filename, file_obj, tarfile_obj) + else: + print('cannot determine kind of file') + + # attempt to parse the document + try: + if parser is not None: + parser.parse() + except Exception as e: + print('exception while trying to parse file:') + print(str(e)) + parser = None + + # close the files + file_obj.close() + if tarfile_obj is not None: + tarfile_obj.close() + + # return the parsed document + return parser + +arxiv_classes = [ + 'acc-phys', 'adap-org', 'alg-geom', 'ao-sci', 'astro-ph', 'atom-ph', + 'bayes-an', 'chao-dyn', 'chem-ph', 'cmp-lg', 'comp-gas', 'cond-mat', + 'cs', 'dg-ga', 'funct-an', 'gr-qc', 'hep-ex', 'hep-lat', + 'hep-ph', 'hep-th', 'math', 'math-ph', 'mtrl-th', 'nlin', + 'nucl-ex', 'nucl-th', 'patt-sol', 'physics', 'plasm-ph', 'q-alg', + 'q-bio', 'quant-ph', 'solv-int', 'supr-con' +] + +def do_single_file(file_name, print_xml, write_xml_dir): + arxiv_id, arxiv_version = arxivid.filenameToArxivAndVersion(file_name) + if arxiv_id is None: + print('WARN: could not determine arXiv identifier for', file_name) + arxiv_id = '<unknown>' + arxiv_version = 0 + + Log.reset() + Statistics.begin_item(arxiv_id) + + if file_has_suffix(file_name, '.pdf'): + Statistics.count('1) pdf') + succ = True + else: + Statistics.count('2) processed') + + parser = process_article(file_name) + + if parser is not None : + succ = parser['success'] + bib_refs = parser['refs'] + else : + succ = False + bib_refs = [] + + if str_contains_one_of(arxiv_id, ['gr-qc', 'hep-']): + Statistics.count('hep-processed') + if succ: + Statistics.count('hep-success') + if succ: + print('-success--------') + Statistics.count('3) success') + else: + print('-fail-----------') + Statistics.count('4) fail') + + show_ref = False + + if succ and show_ref: + for bib_ref in bib_refs: + print(bib_ref.key, 'with', bib_ref.cite_count, 'citations in paper') + if len(bib_ref.bib_info) == 0: + print('no reference') + else: + print(bib_ref.bib_info_as_str(keep_comments=True)) + + if succ and (print_xml or write_xml_dir): + xml = Element('article') + SubElement(xml, 'id').text = arxiv_id + if arxiv_version > 0: + SubElement(xml, 'version').text = str(arxiv_version) + refs = SubElement(xml, 'refs') + for bib_ref in bib_refs: + bib_text = bib_ref.bib_info_as_str(keep_comments=True) + if len(bib_text) != 0: + ncites = bib_ref.cite_count + if ncites < 1: + ncites = 1 + ref = SubElement(refs, 'ref', order=str(bib_ref.ref_order_num), freq=str(ncites)) + ref.text = bib_text + if print_xml: + print(tostring(xml)) + if isinstance(write_xml_dir, str): + if arxiv_id != '<unknown>': + xml_file_name = os.path.join(write_xml_dir, arxiv_id.replace('/', '') + '.xml') + else: + fname = os.path.split(file_name)[1] + if fname.rfind('.') > 0: + fname = fname[:fname.rfind('.')] + xml_file_name = write_xml_dir + '/' + fname + '.xml' + file_obj = open(xml_file_name, 'wb') + file_obj.write(tostring(xml, encoding='utf-8')) + file_obj.close() + + Statistics.end_item() + + return succ + +summaryStrs = [] + +if __name__ == "__main__": + cmd_parser = argparse.ArgumentParser(description='Parse TeX/LaTeX to find references.') + cmd_parser.add_argument('--filelist', action='store_true', help='file names on the command line each contain a list of files to process') + cmd_parser.add_argument('--print-xml', action='store_true', help='print XML output to stdout') + cmd_parser.add_argument('--write-xml', metavar='<dir>', help='destination directory to write XML output files') + cmd_parser.add_argument('--failed', metavar='<file>', help='output file to write list of failed files') + cmd_parser.add_argument('files', nargs='+', help='input files') + args = cmd_parser.parse_args() + + # print date stamp + timeStart = datetime.datetime.now() + print('[ptex] started processing at', str(timeStart)) + + print('given', len(args.files), 'files, first file:', args.files[0]) + print('================') + + Statistics.clear('article') + + # build list of files to process + file_list = buildFileList(args.filelist, args.files) + + # ensure the destination directory exists + if args.write_xml is not None and os.path.exists(args.write_xml): + try: + os.makedirs(args.write_xml) + except: + pass + + # process the files + failed_files = [] + for file_name in file_list: + success = do_single_file(file_name, args.print_xml, args.write_xml) + if not success: + failed_files.append(file_name) + + # write the failed files to an output file, if requested + if args.failed is not None: + file_obj = open(args.failed, 'w') + file_obj.writelines(f + '\n' for f in failed_files) + file_obj.close() + + print('================') + Statistics.show() + Statistics.show_detail('fail') + #Statistics.show_detail('cite-range') + #Statistics.show_detail('bad-ascii') + #Statistics.show_detail('non-ascii') + + print('================') + + # print date stamp + timeEnd = datetime.datetime.now() + print('[ptex] finished processing at', str(timeEnd)) + + # print summary for email + summaryStrs.extend(Statistics.get_summary()) + summaryStrs.insert(0, 'started processing at %s, took %.1f minutes' % (timeStart.strftime('%H:%M'), (timeEnd - timeStart).total_seconds() / 60)) + for s in summaryStrs: + print('**SUMMARY** [ptex]', s) |