Snakefile 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. # -*- coding: utf-8; -*-
  2. import locale
  3. import os
  4. import os.path
  5. import regex
  6. import urllib.parse
  7. import os.path
  8. import bibtexparser
  9. import pypandoc
  10. from collections.abc import Iterable, Mapping
  11. from distutils.spawn import find_executable
  12. from fnmatch import fnmatch
  13. from subprocess import check_output, check_call
  14. from tempfile import NamedTemporaryFile
  15. from bibtexparser.bibdatabase import BibDatabase
  16. from bibtexparser.bparser import BibTexParser
  17. from lxml import html
  18. from snakemake.utils import min_version
  19. min_version('3.7.1')
  20. try:
  21. from os import scandir, walk
  22. except ImportError:
  23. from scandir import scandir, walk
  24. def unnest(*args):
  25. '''Un-nest list- and tuple-like elements in arguments.
  26. "List-like" means anything with a len() and whose elments can be
  27. accessed with numeric indexing, except for string-like elements. It
  28. must also be an instance of the collections.Iterable abstract class.
  29. Dict-like elements and iterators/generators are not affected.
  30. This function always returns a list, even if it is passed a single
  31. scalar argument.
  32. '''
  33. result = []
  34. for arg in args:
  35. if isinstance(arg, str):
  36. # String
  37. result.append(arg)
  38. elif isinstance(arg, Mapping):
  39. # Dict-like
  40. result.append(arg)
  41. elif isinstance(arg, Iterable):
  42. try:
  43. # Duck-typing test for list-ness (a stricter condition
  44. # than just "iterable")
  45. for i in range(len(arg)):
  46. result.append(arg[i])
  47. except TypeError:
  48. # Iterable but not list-like
  49. result.append(arg)
  50. else:
  51. # Not iterable
  52. result.append(arg)
  53. return result
  54. def check_output_decode(*args, encoding=locale.getpreferredencoding(), **kwargs):
  55. '''Shortcut for check.output + str.decode'''
  56. return check_output(*args, **kwargs).decode(encoding)
  57. def find_mac_app(name):
  58. try:
  59. result = \
  60. check_output_decode(
  61. ['mdfind',
  62. 'kMDItemDisplayName=="{name}.app"c&&kMDItemKind==Application'.format(name=name)]).split('\n')[0] or \
  63. check_output_decode(
  64. ['mdfind',
  65. 'kMDItemDisplayName=="{name}"c&&kMDItemKind==Application'.format(name=name)]).split('\n')[0]
  66. if result:
  67. return result
  68. else:
  69. raise Exception("Not found")
  70. except Exception:
  71. return None
  72. def find_executable_on_mac(executable, path=None, app_name=None, app_paths=None):
  73. # Easy case
  74. found_executable = find_executable(executable, path)
  75. if found_executable:
  76. return found_executable
  77. # Ok, now we search
  78. if app_paths is None:
  79. app_paths = []
  80. if app_name is None:
  81. app_name = executable
  82. found_app_path = find_mac_app(app_name)
  83. if found_app_path:
  84. app_paths.append(found_app_path)
  85. if app_paths:
  86. new_search_path = ":".join(os.path.join(p, 'Contents/MacOS') for p in app_paths)
  87. return find_executable(executable, path=new_search_path)
  88. else:
  89. return None
# Fallback to /bin/false to trigger an error when run (we don't want
# to trigger an error now, while building the rules)
LYX_PATH = find_executable_on_mac('lyx') or '/bin/false'
# GIMP_PATH = find_executable_on_mac('gimp', app_name='gimp*') or '/bin/false'
# pdfinfo is optional: PDFINFO_PATH is None when it cannot be found,
# in which case print_pdfinfo() does nothing.
PDFINFO_PATH = find_executable('pdfinfo')
  95. def print_pdfinfo(filename):
  96. if PDFINFO_PATH:
  97. shell('''{PDFINFO_PATH} {filename:q}''')
  98. def glob_recursive(pattern, top='.', include_hidden=False, *args, **kwargs):
  99. '''Combination of glob.glob and os.walk.
  100. Reutrns the relative path to every file or directory matching the
  101. pattern anywhere in the specified directory hierarchy. Defaults to the
  102. current working directory. Any additional arguments are passed to
  103. os.walk.'''
  104. for (path, dirs, files) in walk(top, *args, **kwargs):
  105. for f in dirs + files:
  106. if include_hidden or f.startswith('.'):
  107. continue
  108. if fnmatch(f, pattern):
  109. yield os.path.normpath(os.path.join(path, f))
  110. def rsync_list_files(*paths, extra_rsync_args=(), include_dirs=False):
  111. '''Iterate over the files in path that rsync would copy.
  112. By default, only files are listed, not directories, since doit doesn't
  113. like dependencies on directories because it can't hash them.
  114. This uses "rsync --list-only" to make rsync directly indicate which
  115. files it would copy, so any exclusion/inclusion rules are taken into
  116. account.
  117. '''
  118. rsync_list_cmd = [ 'rsync', '-r', '--list-only' ] + unnest(extra_rsync_args) + unnest(paths) + [ '.' ]
  119. rsync_out = check_output_decode(rsync_list_cmd).splitlines()
  120. for line in rsync_out:
  121. s = regex.search('^(-|d)(?:\S+\s+){4}(.*)', line)
  122. if s is not None:
  123. if include_dirs or s.group(1) == '-':
  124. yield s.group(2)
  125. def lyx_input_deps(lyxfile):
  126. '''Return an iterator over all tex files included by a Lyx file.'''
  127. try:
  128. with open(lyxfile) as f:
  129. lyx_text = f.read()
  130. for m in regex.finditer('\\\\(?:input|loadglsentries){(.*?[.]tex)}', lyx_text):
  131. yield m.group(1)
  132. except FileNotFoundError:
  133. pass
  134. def lyx_bib_deps(lyxfile):
  135. '''Return an iterator over all bib files referenced by a Lyx file.
  136. This will only return the names of existing files, so it will be
  137. unreliable in the case of an auto-generated bib file.
  138. '''
  139. try:
  140. with open(lyxfile) as f:
  141. lyx_text = f.read()
  142. bib_names = regex.search('bibfiles "(.*?)"', lyx_text).group(1).split(',')
  143. # Unfortunately LyX doesn't indicate which bib names refer to
  144. # files in the current directory and which don't. Currently that's
  145. # not a problem for me since all my refs are in bib files in the
  146. # current directory.
  147. for bn in bib_names:
  148. bib_path = bn + '.bib'
  149. yield bib_path
  150. except FileNotFoundError:
  151. pass
  152. def lyx_gfx_deps(lyxfile):
  153. '''Return an iterator over all graphics files included by a LyX file.'''
  154. try:
  155. with open(lyxfile) as f:
  156. lyx_text = f.read()
  157. for m in regex.finditer('\\\\begin_inset Graphics\\s+filename (.*?)$', lyx_text, regex.MULTILINE):
  158. yield m.group(1)
  159. except FileNotFoundError:
  160. pass
  161. def lyx_hrefs(lyxfile):
  162. '''Return an iterator over hrefs in a LyX file.'''
  163. try:
  164. pattern = '''
  165. (?xsm)
  166. ^ LatexCommand \\s+ href \\s* \\n
  167. (?: name \\b [^\\n]+ \\n )?
  168. target \\s+ "(.*?)" $
  169. '''
  170. with open(lyxfile) as f:
  171. return (urllib.parse.unquote(m.group(1)) for m in
  172. re.finditer(pattern, f.read()))
  173. except FileNotFoundError:
  174. pass
  175. def tex_gfx_extensions(tex_format = 'xetex'):
  176. '''Return the ordered list of graphics extensions.
  177. This yields the list of extensions that TeX will try for an
  178. \\includegraphics path.
  179. '''
  180. try:
  181. cmdout = check_output_decode(['texdef', '-t', tex_format, '-p', 'graphicx', 'Gin@extensions'])
  182. m = regex.search('^macro:->(.*?)$', cmdout, regex.MULTILINE)
  183. return m.group(1).split(',')
  184. except FileNotFoundError:
  185. return ()
  186. def get_latex_included_gfx(fname):
  187. '''Return list of all graphics included from '''
  188. try:
  189. with open(fname) as infile:
  190. beamer_latex = infile.read()
  191. # Remove comments
  192. beamer_latex = regex.sub('^%.*$','', beamer_latex)
  193. # Find graphics included
  194. return [ m.group(1) for m in regex.finditer(r'\includegraphics(?:\[.*?\])?\{(.+?)\}', beamer_latex) ]
  195. except FileNotFoundError:
  196. return ()
# Shared rsync options: recursive copy following symlinks (-rL), compare
# by size only, delete extraneous and excluded files on the receiving
# side, and never transfer .DS_Store files.
# NOTE(review): not referenced by any rule visible in this file section.
rsync_common_args = ['-rL', '--size-only', '--delete', '--exclude', '.DS_Store', '--delete-excluded',]
  198. rule build_all:
  199. input: 'thesis.pdf', 'thesis-final.pdf', 'presentation.pdf'
  200. # Note: Any rule that generates an input LyX file for this rule must
  201. # be marked as a checkpoint. See
  202. # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#data-dependent-conditional-execution
  203. rule thesis_lyx_to_pdf:
  204. '''Produce PDF output for a LyX file.'''
  205. input: lyxfile = '{basename}.lyx',
  206. gfx_deps = lambda wildcards: lyx_gfx_deps(wildcards.basename + '.lyx'),
  207. bib_deps = lambda wildcards: lyx_bib_deps(wildcards.basename + '.lyx'),
  208. tex_deps = lambda wildcards: lyx_input_deps(wildcards.basename + '.lyx'),
  209. output: pdf='{basename,thesis.*}.pdf'
  210. run:
  211. if not LYX_PATH or LYX_PATH == '/bin/false':
  212. raise Exception('Path to LyX executable could not be found.')
  213. shell('''{LYX_PATH:q} -batch --verbose --export-to pdf4 {output.pdf:q} {input.lyxfile:q}''')
  214. print_pdfinfo(output.pdf)
  215. checkpoint lyx_add_final:
  216. '''Copy LyX file and add final option.'''
  217. input: lyxfile = '{basename}.lyx'
  218. # Ensure we can't get file-final-final-final-final.lyx
  219. output: lyxtemp = temp('{basename,(?!graphics/).*(?<!-final)}-final.lyx')
  220. run:
  221. with open(input.lyxfile, 'r') as infile, \
  222. open(output.lyxtemp, 'w') as outfile:
  223. lyx_text = infile.read()
  224. if not regex.search('\\\\options final', lyx_text):
  225. lyx_text = regex.sub('\\\\use_default_options true', '\\\\options final\n\\\\use_default_options true', lyx_text)
  226. outfile.write(lyx_text)
  227. rule process_bib:
  228. '''Preprocess bib file for LaTeX.
  229. For entries with a DOI, all URLs are stripped, since the DOI already
  230. provides a clickable link. For entries with no DOI, all but one URL is
  231. discarded, since LyX can't handle entries with multiple URLs. The
  232. shortest URL is kept.'''
  233. input: '{basename}.bib'
  234. output: '{basename,.*(?<!-PROCESSED)}-PROCESSED.bib'
  235. run:
  236. with open(input[0]) as infile:
  237. parser = BibTexParser(
  238. ignore_nonstandard_types = False,
  239. )
  240. bib_db = bibtexparser.load(infile, parser)
  241. entries = bib_db.entries
  242. for entry in entries:
  243. if 'doi' in entry:
  244. try:
  245. del entry['url']
  246. except KeyError:
  247. pass
  248. else:
  249. try:
  250. entry_urls = regex.split('\\s+', entry['url'])
  251. shortest_url = min(entry_urls, key=len)
  252. entry['url'] = shortest_url
  253. except KeyError:
  254. pass
  255. new_db = BibDatabase()
  256. new_db.entries = entries
  257. with open(output[0], 'w') as outfile:
  258. bibtexparser.dump(new_db, outfile)
  259. rule pdf_extract_page:
  260. '''Extract a single page from a multi-page PDF.'''
  261. # Input is a PDF whose basename doesn't already have a page number
  262. input: pdf = 'graphics/{basename}.pdf'
  263. output: pdf = 'graphics/{basename}-PAGE{pagenum,[1-9][0-9]*}.pdf'
  264. run:
  265. # This could be done with a regex constraint on basename,
  266. # except that variable width lookbehind isn't supported.
  267. # Unfortunately, that makes this a runtime error instead of an
  268. # error during DAG construction.
  269. if regex.search('-PAGE[0-9]+$', wildcards.basename):
  270. raise ValueError("Can't extract page from extracted page PDF.")
  271. shell('pdfseparate -f {wildcards.pagenum:q} -l {wildcards.pagenum:q} {input:q} {output:q}')
  272. rule pdf_crop:
  273. '''Crop away empty margins from a PDF.'''
  274. input: pdf = 'graphics/{basename}.pdf'
  275. output: pdf = 'graphics/{basename,.*(?<!-CROP)}-CROP.pdf'
  276. shell: 'pdfcrop --resolution 300 {input:q} {output:q}'
  277. rule pdf_raster:
  278. '''Rasterize PDF to PNG at 600 PPI.
  279. The largest dimension is scaled '''
  280. input: pdf = 'graphics/{basename}.pdf'
  281. output: png = 'graphics/{basename}-RASTER.png'
  282. shell: 'pdftoppm -singlefile -r 600 {input:q} | convert - {output:q}'
  283. rule pdf_raster_res:
  284. '''Rasterize PDF to PNG at specific PPI.
  285. The largest dimension is scaled '''
  286. input: pdf = 'graphics/{basename}.pdf'
  287. output: png = 'graphics/{basename}-RASTER{res,[1-9][0-9]+}.png'
  288. shell: 'pdftoppm -singlefile -r {wildcards.res} {input:q} | convert - {output:q}'
  289. rule png_crop:
  290. '''Crop away empty margins from a PNG.'''
  291. input: pdf = 'graphics/{basename}.png'
  292. output: pdf = 'graphics/{basename,.*(?<!-CROP)}-CROP.png'
  293. shell: 'convert {input:q} -trim {output:q}'
  294. rule jpg_crop:
  295. '''Crop away empty margins from a JPG.'''
  296. input: pdf = 'graphics/{basename}.jpg'
  297. output: pdf = 'graphics/{basename,.*(?<!-CROP)}-CROP.jpg'
  298. shell: 'convert {input:q} -trim {output:q}'
  299. rule svg_to_pdf:
  300. input: 'graphics/{filename}.svg'
  301. output: 'graphics/{filename}-SVG.pdf'
  302. run:
  303. infile = os.path.join(os.path.abspath("."), input[0])
  304. outfile = os.path.join(os.path.abspath("."), output[0])
  305. shell('''inkscape {infile:q} --export-pdf={outfile:q} --export-dpi=300''')
  306. rule svg_raster:
  307. input: 'graphics/{filename}.svg'
  308. output: 'graphics/{filename}-SVG.png'
  309. run:
  310. infile = os.path.join(os.path.abspath("."), input[0])
  311. outfile = os.path.join(os.path.abspath("."), output[0])
  312. shell('''inkscape {infile:q} --export-png={outfile:q} --export-dpi=300''')
  313. rule png_rotate:
  314. input: 'graphics/{filename}.png'
  315. output: 'graphics/{filename}-ROT{angle,[1-9][0-9]*}.png'
  316. run:
  317. if re.search('-ROT[1-9][0-9]*$', wildcards.filename):
  318. raise ValueError("Cannot double-rotate")
  319. shell('convert {input:q} -rotate {wildcards.angle:q} {output:q}')
  320. # rule xcf_to_png:
  321. # input: 'graphics/{filename}.xcf'
  322. # output: 'graphics/{filename}.png'
  323. # shell: 'convert {input:q} -flatten {output:q}'
  324. rule R_to_html:
  325. '''Render an R script as syntax-hilighted HTML.'''
  326. input: '{dirname}/{basename}.R'
  327. output: '{dirname}/{basename,[^/]+}.R.html'
  328. shell: 'pygmentize -f html -O full -l R -o {output:q} {input:q}'
  329. checkpoint build_beamer_latex:
  330. input:
  331. extra_preamble='extra-preamble.latex',
  332. mkdn_file='{basename}.mkdn',
  333. # images=lambda wildcards: get_mkdn_included_images('{basename}.mkdn'.format(**wildcards)),
  334. # pdfs=lambda wildcards: get_mkdn_included_pdfs('{basename}.mkdn'.format(**wildcards)),
  335. output:
  336. latex='{basename,presentation.*}.tex',
  337. # TODO: should work with shadow minimal but doesn't
  338. run:
  339. beamer_latex = pypandoc.convert_file(
  340. input.mkdn_file, 'beamer', format='md',
  341. extra_args = [
  342. '-H', input.extra_preamble,
  343. '--pdf-engine=xelatex',
  344. ])
  345. # Center all columns vertically
  346. beamer_latex = beamer_latex.replace(r'\begin{columns}[T]', r'\begin{columns}[c]')
  347. with open(output.latex, 'w') as latex_output:
  348. latex_output.write(beamer_latex)
  349. rule build_beamer_pdf:
  350. input:
  351. latex='{basename}.tex',
  352. gfx=lambda wildcards: get_latex_included_gfx('{basename}.tex'.format(**wildcards)),
  353. output:
  354. pdf='{basename,presentation.*}.pdf'
  355. shadow: 'minimal'
  356. run:
  357. shell('''xelatex {input.latex:q} </dev/null''')
  358. print_pdfinfo(output.pdf)
  359. rule build_all_presentations:
  360. input:
  361. 'presentation.pdf',
  362. 'presentation.pptx',
  363. rule make_transplant_organs_graph:
  364. input:
  365. Rscript='graphics/presentation/transplants-organ.R',
  366. data='graphics/presentation/transplants-organ.xlsx',
  367. output:
  368. pdf='graphics/presentation/transplants-organ.pdf'
  369. shell: '''
  370. Rscript 'graphics/presentation/transplants-organ.R'
  371. '''