summaryrefslogtreecommitdiffstatshomepage
path: root/tests/bytecode/mp-tests/ptex.py
diff options
context:
space:
mode:
Diffstat (limited to 'tests/bytecode/mp-tests/ptex.py')
-rw-r--r--tests/bytecode/mp-tests/ptex.py269
1 files changed, 269 insertions, 0 deletions
diff --git a/tests/bytecode/mp-tests/ptex.py b/tests/bytecode/mp-tests/ptex.py
new file mode 100644
index 0000000000..8f23d78009
--- /dev/null
+++ b/tests/bytecode/mp-tests/ptex.py
@@ -0,0 +1,269 @@
+import sys
+import os
+import os.path
+import datetime
+import argparse
+from xml.etree.ElementTree import Element, SubElement, tostring
+
+from log import Log
+from texparser import TexParser
+from latexparser import LatexParser
+from gettexfile import file_has_suffix
+from gettexfile import get_tex_file
+
+from xiwi.common.misc import buildFileList
+from xiwi.common import arxivid
+from xiwi.common.stats import Statistics
+
+def str_contains(s1, s2):
+ return s1.find(s2) != -1
+
+def str_contains_one_of(st, st_list):
+ for st2 in st_list:
+ if str_contains(st, st2):
+ return True
+ return False
+
+def detect_file_kind(file_obj):
+ """Simple detection of kind of source file."""
+ kind = 'unknown'
+ firstline = file_obj.readline()
+ while firstline.isspace():
+ firstline = file_obj.readline()
+ if firstline.startswith('%!PS'):
+ kind = 'PS'
+ elif firstline.startswith('%auto-ignore'):
+ kind = 'auto-ignore'
+ else:
+ file_obj.seek(0)
+ for line in file_obj:
+ if str_contains(line, '\\def'):
+ # might be tex, if we don't find anything else
+ kind = 'tex'
+ if str_contains(line, '\\input'):
+ # might be tex, if we don't find anything else
+ kind = 'tex'
+ if str_contains(line, 'amstex') or str_contains(line, 'harvmac'):
+ # definitely tex
+ kind = 'tex'
+ break
+ if str_contains(line, '\\documentclass'):
+ # definitely latex
+ kind = 'latex'
+ break
+ if str_contains(line, '\\documentstyle'):
+ # could be tex or latex
+ if str_contains(line, 'amsppt'):
+ kind = 'tex'
+ break
+ else:
+ kind = 'latex'
+ break
+ file_obj.seek(0)
+ return kind
+
+class WithdrawnPaper(object):
+ def __init__(self):
+ pass
+
+ def __getitem__(self, item):
+ if item == 'refs':
+ return []
+ elif item == 'success':
+ return True
+
+ def parse(self):
+ pass
+
+def process_article(filename):
+ """Returns TexParserBase derived object on success, None on failure."""
+
+ # get the tex file
+ filename, file_obj, tarfile_obj = get_tex_file(filename)
+ if file_obj is None:
+ return None
+
+ # detect the type of file
+ kind = detect_file_kind(file_obj)
+
+ # act on the type of file
+ parser = None
+ if kind == 'PS':
+ print('skipping postscript file')
+ elif kind == 'auto-ignore':
+ print('asked to ignore file, most likely it was withdrawn')
+ parser = WithdrawnPaper()
+ if kind == 'tex':
+ print('parsing as TeX')
+ parser = TexParser(filename, file_obj, tarfile_obj)
+ elif kind == 'latex':
+ print('parsing as LaTeX')
+ parser = LatexParser(filename, file_obj, tarfile_obj)
+ else:
+ print('cannot determine kind of file')
+
+ # attempt to parse the document
+ try:
+ if parser is not None:
+ parser.parse()
+ except Exception as e:
+ print('exception while trying to parse file:')
+ print(str(e))
+ parser = None
+
+ # close the files
+ file_obj.close()
+ if tarfile_obj is not None:
+ tarfile_obj.close()
+
+ # return the parsed document
+ return parser
+
+arxiv_classes = [
+ 'acc-phys', 'adap-org', 'alg-geom', 'ao-sci', 'astro-ph', 'atom-ph',
+ 'bayes-an', 'chao-dyn', 'chem-ph', 'cmp-lg', 'comp-gas', 'cond-mat',
+ 'cs', 'dg-ga', 'funct-an', 'gr-qc', 'hep-ex', 'hep-lat',
+ 'hep-ph', 'hep-th', 'math', 'math-ph', 'mtrl-th', 'nlin',
+ 'nucl-ex', 'nucl-th', 'patt-sol', 'physics', 'plasm-ph', 'q-alg',
+ 'q-bio', 'quant-ph', 'solv-int', 'supr-con'
+]
+
+def do_single_file(file_name, print_xml, write_xml_dir):
+ arxiv_id, arxiv_version = arxivid.filenameToArxivAndVersion(file_name)
+ if arxiv_id is None:
+ print('WARN: could not determine arXiv identifier for', file_name)
+ arxiv_id = '<unknown>'
+ arxiv_version = 0
+
+ Log.reset()
+ Statistics.begin_item(arxiv_id)
+
+ if file_has_suffix(file_name, '.pdf'):
+ Statistics.count('1) pdf')
+ succ = True
+ else:
+ Statistics.count('2) processed')
+
+ parser = process_article(file_name)
+
+ if parser is not None :
+ succ = parser['success']
+ bib_refs = parser['refs']
+ else :
+ succ = False
+ bib_refs = []
+
+ if str_contains_one_of(arxiv_id, ['gr-qc', 'hep-']):
+ Statistics.count('hep-processed')
+ if succ:
+ Statistics.count('hep-success')
+ if succ:
+ print('-success--------')
+ Statistics.count('3) success')
+ else:
+ print('-fail-----------')
+ Statistics.count('4) fail')
+
+ show_ref = False
+
+ if succ and show_ref:
+ for bib_ref in bib_refs:
+ print(bib_ref.key, 'with', bib_ref.cite_count, 'citations in paper')
+ if len(bib_ref.bib_info) == 0:
+ print('no reference')
+ else:
+ print(bib_ref.bib_info_as_str(keep_comments=True))
+
+ if succ and (print_xml or write_xml_dir):
+ xml = Element('article')
+ SubElement(xml, 'id').text = arxiv_id
+ if arxiv_version > 0:
+ SubElement(xml, 'version').text = str(arxiv_version)
+ refs = SubElement(xml, 'refs')
+ for bib_ref in bib_refs:
+ bib_text = bib_ref.bib_info_as_str(keep_comments=True)
+ if len(bib_text) != 0:
+ ncites = bib_ref.cite_count
+ if ncites < 1:
+ ncites = 1
+ ref = SubElement(refs, 'ref', order=str(bib_ref.ref_order_num), freq=str(ncites))
+ ref.text = bib_text
+ if print_xml:
+ print(tostring(xml))
+ if isinstance(write_xml_dir, str):
+ if arxiv_id != '<unknown>':
+ xml_file_name = os.path.join(write_xml_dir, arxiv_id.replace('/', '') + '.xml')
+ else:
+ fname = os.path.split(file_name)[1]
+ if fname.rfind('.') > 0:
+ fname = fname[:fname.rfind('.')]
+ xml_file_name = write_xml_dir + '/' + fname + '.xml'
+ file_obj = open(xml_file_name, 'wb')
+ file_obj.write(tostring(xml, encoding='utf-8'))
+ file_obj.close()
+
+ Statistics.end_item()
+
+ return succ
+
+summaryStrs = []
+
+if __name__ == "__main__":
+ cmd_parser = argparse.ArgumentParser(description='Parse TeX/LaTeX to find references.')
+ cmd_parser.add_argument('--filelist', action='store_true', help='file names on the command line each contain a list of files to process')
+ cmd_parser.add_argument('--print-xml', action='store_true', help='print XML output to stdout')
+ cmd_parser.add_argument('--write-xml', metavar='<dir>', help='destination directory to write XML output files')
+ cmd_parser.add_argument('--failed', metavar='<file>', help='output file to write list of failed files')
+ cmd_parser.add_argument('files', nargs='+', help='input files')
+ args = cmd_parser.parse_args()
+
+ # print date stamp
+ timeStart = datetime.datetime.now()
+ print('[ptex] started processing at', str(timeStart))
+
+ print('given', len(args.files), 'files, first file:', args.files[0])
+ print('================')
+
+ Statistics.clear('article')
+
+ # build list of files to process
+ file_list = buildFileList(args.filelist, args.files)
+
+ # ensure the destination directory exists
+ if args.write_xml is not None and os.path.exists(args.write_xml):
+ try:
+ os.makedirs(args.write_xml)
+ except:
+ pass
+
+ # process the files
+ failed_files = []
+ for file_name in file_list:
+ success = do_single_file(file_name, args.print_xml, args.write_xml)
+ if not success:
+ failed_files.append(file_name)
+
+ # write the failed files to an output file, if requested
+ if args.failed is not None:
+ file_obj = open(args.failed, 'w')
+ file_obj.writelines(f + '\n' for f in failed_files)
+ file_obj.close()
+
+ print('================')
+ Statistics.show()
+ Statistics.show_detail('fail')
+ #Statistics.show_detail('cite-range')
+ #Statistics.show_detail('bad-ascii')
+ #Statistics.show_detail('non-ascii')
+
+ print('================')
+
+ # print date stamp
+ timeEnd = datetime.datetime.now()
+ print('[ptex] finished processing at', str(timeEnd))
+
+ # print summary for email
+ summaryStrs.extend(Statistics.get_summary())
+ summaryStrs.insert(0, 'started processing at %s, took %.1f minutes' % (timeStart.strftime('%H:%M'), (timeEnd - timeStart).total_seconds() / 60))
+ for s in summaryStrs:
+ print('**SUMMARY** [ptex]', s)