Snakefile 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
# -*- coding: utf-8; -*-
import locale
import os.path
import re
import urllib.parse
from collections.abc import Iterable, Mapping
from distutils.spawn import find_executable
from fnmatch import fnmatch
from subprocess import check_output, check_call
from tempfile import NamedTemporaryFile

import bibtexparser
import regex
from bibtexparser.bibdatabase import BibDatabase
from bibtexparser.bparser import BibTexParser

try:
    from os import scandir, walk
except ImportError:
    from scandir import scandir, walk
  18. def unnest(*args):
  19. '''Un-nest list- and tuple-like elements in arguments.
  20. "List-like" means anything with a len() and whose elments can be
  21. accessed with numeric indexing, except for string-like elements. It
  22. must also be an instance of the collections.Iterable abstract class.
  23. Dict-like elements and iterators/generators are not affected.
  24. This function always returns a list, even if it is passed a single
  25. scalar argument.
  26. '''
  27. result = []
  28. for arg in args:
  29. if isinstance(arg, str):
  30. # String
  31. result.append(arg)
  32. elif isinstance(arg, Mapping):
  33. # Dict-like
  34. result.append(arg)
  35. elif isinstance(arg, Iterable):
  36. try:
  37. # Duck-typing test for list-ness (a stricter condition
  38. # than just "iterable")
  39. for i in range(len(arg)):
  40. result.append(arg[i])
  41. except TypeError:
  42. # Iterable but not list-like
  43. result.append(arg)
  44. else:
  45. # Not iterable
  46. result.append(arg)
  47. return result
  48. def check_output_decode(*args, encoding=locale.getpreferredencoding(), **kwargs):
  49. '''Shortcut for check.output + str.decode'''
  50. return check_output(*args, **kwargs).decode(encoding)
  51. def find_mac_app(name):
  52. try:
  53. result = check_output_decode(
  54. ['mdfind',
  55. 'kMDItemDisplayName=={name}.app&&kMDItemKind==Application'.format(name=name)]).split('\n')[0]
  56. if not result:
  57. raise Exception("No result found")
  58. return result
  59. except Exception:
  60. return None
  61. def glob_recursive(pattern, top='.', include_hidden=False, *args, **kwargs):
  62. '''Combination of glob.glob and os.walk.
  63. Reutrns the relative path to every file or directory matching the
  64. pattern anywhere in the specified directory hierarchy. Defaults to the
  65. current working directory. Any additional arguments are passed to
  66. os.walk.'''
  67. for (path, dirs, files) in walk(top, *args, **kwargs):
  68. for f in dirs + files:
  69. if include_hidden or f.startswith('.'):
  70. continue
  71. if fnmatch(f, pattern):
  72. yield os.path.normpath(os.path.join(path, f))
  73. LYXPATH = find_executable('lyx') or \
  74. os.path.join(find_mac_app('LyX'), 'Contents/MacOS/lyx') or \
  75. '/bin/false'
  76. def rsync_list_files(*paths, extra_rsync_args=(), include_dirs=False):
  77. '''Iterate over the files in path that rsync would copy.
  78. By default, only files are listed, not directories, since doit doesn't
  79. like dependencies on directories because it can't hash them.
  80. This uses "rsync --list-only" to make rsync directly indicate which
  81. files it would copy, so any exclusion/inclusion rules are taken into
  82. account.
  83. '''
  84. rsync_list_cmd = [ 'rsync', '-r', '--list-only' ] + unnest(extra_rsync_args) + unnest(paths) + [ '.' ]
  85. rsync_out = check_output_decode(rsync_list_cmd).splitlines()
  86. for line in rsync_out:
  87. s = regex.search('^(-|d)(?:\S+\s+){4}(.*)', line)
  88. if s is not None:
  89. if include_dirs or s.group(1) == '-':
  90. yield s.group(2)
  91. def lyx_bib_deps(lyxfile):
  92. '''Return an iterator over all bib files referenced by a Lyx file.
  93. This will only return the names of existing files, so it will be
  94. unreliable in the case of an auto-generated bib file.
  95. '''
  96. try:
  97. with open(lyxfile) as f:
  98. lyx_text = f.read()
  99. bib_names = []
  100. for m in regex.finditer('bibfiles "(.*?)"', lyx_text):
  101. bib_names.extend(m.group(1).split(','))
  102. # Unfortunately LyX doesn't indicate which bib names refer to
  103. # files in the current directory and which don't. Currently that's
  104. # not a problem for me since all my refs are in bib files in the
  105. # current directory.
  106. for bn in bib_names:
  107. bib_path = bn + '.bib'
  108. yield bib_path
  109. except FileNotFoundError:
  110. pass
  111. def lyx_hrefs(lyxfile):
  112. '''Return an iterator over hrefs in a LyX file.'''
  113. pattern = '''
  114. (?xsm)
  115. ^ LatexCommand \\s+ href \\s* \\n
  116. (?: name \\b [^\\n]+ \\n )?
  117. target \\s+ "(.*?)" $
  118. '''
  119. with open(lyxfile) as f:
  120. return (urllib.parse.unquote(m.group(1)) for m in
  121. re.finditer(pattern, f.read()))
# Base URL under which the example files are published; hrefs in the
# LyX files that start with this prefix refer to local files under
# examples_dir (see resume_example_deps).
examples_base_url = 'https://darwinawardwinner.github.io/resume/examples/'
# Local directory corresponding to examples_base_url.
examples_dir = 'examples'
  124. def resume_example_deps(lyxfile):
  125. '''Iterate over all referenced example files in a LyX file.'''
  126. for href in lyx_hrefs(lyxfile):
  127. if href.startswith(examples_base_url) and not href.endswith('/'):
  128. expath = href[len(examples_base_url):]
  129. yield os.path.join(examples_dir, expath)
# Every README in the examples tree gets a rendered index.html next to
# it (see rule examples_readme_to_index_html below).
readme_files = list(glob_recursive('README.mkdn', top='examples'))
index_files = [ os.path.join(os.path.dirname(f), 'index.html') for f in readme_files ]
# Shared rsync options: follow symlinks, compare by size only, and
# delete excluded/extraneous files (e.g. .DS_Store) on the receiver.
rsync_common_args = ['-rL', '--size-only', '--delete', '--exclude', '.DS_Store', '--delete-excluded',]
# All files rsync would publish from the examples directory. NOTE:
# this runs rsync at Snakefile-parse time.
all_example_files = set(rsync_list_files('examples', extra_rsync_args=rsync_common_args))
# One pygmentized HTML rendering per R script (see rule R_to_html)...
r_html_files = [ f + '.html' for f in all_example_files if f.endswith('.R') ]
# ...and the generated index pages are published alongside the sources.
all_example_files = all_example_files.union(index_files)
all_example_files = all_example_files.union(r_html_files)
# Top-level target: build the resume and CV PDFs plus all generated
# example HTML pages. The HTML resume/CV targets are currently
# disabled (commented out below).
rule build_all:
    input: 'ryan_thompson_resume.pdf', #'ryan_thompson_resume.html',
           'ryan_thompson_cv.pdf', #'ryan_thompson_cv.html',
           #'index.html',
           index_files, r_html_files
# Export the resume from LyX to PDF. The bib files, example files, and
# headshot are declared as inputs only so the PDF is rebuilt when they
# change; the shell command itself reads just the LyX file.
rule create_resume_pdf:
    input: lyxfile='ryan_thompson_resume.lyx',
           bibfiles=list(lyx_bib_deps('ryan_thompson_resume.lyx')),
           example_files=list(resume_example_deps('ryan_thompson_resume.lyx')),
           headshot='headshot-crop.png',
    output: pdf='ryan_thompson_resume.pdf'
    shell: '{LYXPATH:q} --export-to pdf4 {output.pdf:q} {input.lyxfile:q}'
  149. # rule create_resume_html:
  150. # input: lyxfile='ryan_thompson_resume.lyx',
  151. # bibfiles=list(lyx_bib_deps('ryan_thompson_resume.lyx')),
  152. # example_files=list(resume_example_deps('ryan_thompson_resume.lyx')),
  153. # headshot='headshot-crop.png',
  154. # output: html='ryan_thompson_resume.html'
  155. # run:
  156. # with NamedTemporaryFile() as tempf:
  157. # shell('{LYXPATH:q} --export-to xhtml {tempf.name:q} {input.lyxfile:q}')
  158. # shell('''cat {tempf.name:q} | perl -lape 's[<span class="flex_cv_image">(.*?)</span>][<span class="flex_cv_image"><img src="$1" width="100"></span>]g' > {output.html:q}''')
# Export the CV from LyX to PDF; mirrors create_resume_pdf, with the
# non-LyX inputs declared only to trigger rebuilds on change.
rule create_cv_pdf:
    input: lyxfile='ryan_thompson_cv.lyx',
           bibfiles=list(lyx_bib_deps('ryan_thompson_cv.lyx')),
           example_files=list(resume_example_deps('ryan_thompson_cv.lyx')),
           headshot='headshot-crop.png',
    output: pdf='ryan_thompson_cv.pdf'
    shell: '{LYXPATH:q} --export-to pdf4 {output.pdf:q} {input.lyxfile:q}'
  166. # rule create_cv_html:
  167. # input: lyxfile='ryan_thompson_cv.lyx',
  168. # bibfiles=list(lyx_bib_deps('ryan_thompson_cv.lyx')),
  169. # example_files=list(resume_example_deps('ryan_thompson_cv.lyx')),
  170. # headshot='headshot-crop.png',
  171. # output: html='ryan_thompson_cv.html'
  172. # run:
  173. # with NamedTemporaryFile() as tempf:
  174. # shell('{LYXPATH:q} --export-to xhtml {tempf.name:q} {input.lyxfile:q}')
  175. # shell('''cat {tempf.name:q} | perl -lape 's[<span class="flex_cv_image">(.*?)</span>][<span class="flex_cv_image"><img src="$1" width="100"></span>]g' > {output.html:q}''')
  176. # rule link_resume_to_index_html:
  177. # input: 'ryan_thompson_resume.html'
  178. # output: 'index.html'
  179. # shell: 'ln -s {input:q} {output:q}'
# Render each example README to an index.html in the same directory
# with pandoc. The dirname wildcard is constrained to the examples
# tree (optionally with subdirectories) and includes a trailing slash.
rule examples_readme_to_index_html:
    input: '{dirname}README.mkdn'
    output: '{dirname,examples(/.*)?/}index.html'
    shell: 'pandoc -t html -o {output[0]:q} {input[0]:q}'
# Syntax-highlight each R script into a standalone HTML page via
# pygments. The basename wildcard excludes "/" so it matches a single
# filename component, not a nested path.
rule R_to_html:
    input: '{dirname}/{basename,[^/]+}.R'
    output: '{dirname}/{basename}.R.html'
    shell: 'pygmentize -f html -O full -l R -o {output:q} {input:q}'
rule process_bib:
    '''Preprocess bib file for LaTeX.

    For entries with a DOI, all URLs are stripped, since the DOI already
    provides a clickable link. For entries with no DOI, all but one URL is
    discarded, since LyX can't handle entries with multiple URLs. The
    shortest URL is kept.'''
    input: '{basename}.bib'
    # Negative lookbehind keeps already-processed files from matching,
    # preventing an infinite -PROCESSED-PROCESSED... chain.
    output: '{basename,.*(?<!-PROCESSED)}-PROCESSED.bib'
    run:
        with open(input[0]) as infile:
            # Permissive parse: keep nonstandard entry types and expand
            # the common @string month abbreviations.
            parser = BibTexParser(
                ignore_nonstandard_types = False,
                common_strings = True,
            )
            bib_db = bibtexparser.load(infile, parser)
        entries = bib_db.entries
        for entry in entries:
            # Keep DOI or exactly one URL
            if 'doi' in entry:
                # DOI present: drop the url field entirely (EAFP; the
                # field may be absent).
                try:
                    del entry['url']
                except KeyError:
                    pass
            else:
                # No DOI: keep only the shortest of the (whitespace-
                # separated) URLs, since LyX can't handle multiple URLs.
                try:
                    entry_urls = regex.split('\\s+', entry['url'])
                    shortest_url = min(entry_urls, key=len)
                    # Need to fix e.g. 'http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=55329{\\&}tool=pmcentrez{\\&}rendertype=abstract'
                    # by unwrapping BibTeX brace escapes like {\&} -> &.
                    # NOTE(review): this line uses the stdlib "re" module
                    # while the rest of the file uses "regex" — it needs
                    # "import re" at the top of the file to work.
                    shortest_url = re.sub('\\{\\\\?(.)\\}', '\\1', shortest_url)
                    entry['url'] = shortest_url
                except KeyError:
                    pass
            # Delete PMID because it doesn't make a functional link
            try:
                if entry['eprinttype'] == 'pmid':
                    del entry['eprinttype']
                    del entry['eprint']
            except KeyError:
                pass
            # Boldface my name
            # NOTE(review): assumes every entry has an 'author' field;
            # an entry without one would raise KeyError here — confirm
            # against the bib files in use.
            authors = regex.split("\\s+and\\s+",entry['author'])
            for i in range(len(authors)):
                m = regex.search('^Thompson,\\s+(R.*)$', authors[i])
                if m:
                    authors[i] = f"\\textbf{{{m.group(1)} Thompson}}"
            entry['author'] = ' and '.join(authors)
            # Fix CD4+ formatting (superscript plus sign)
            # NOTE(review): likewise assumes a 'title' field exists.
            entry['title'] = regex.sub('CD4\\+', 'CD4$^{+}$', entry['title'])
        new_db = BibDatabase()
        new_db.entries = entries
        with open(output[0], 'w') as outfile:
            bibtexparser.dump(new_db, outfile)