#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Convert InDesign tagged text to ConTeXt

Usage: python <this script> SOURCE [TARGET]

SOURCE is decoded as UTF-16 big-endian; TARGET is written as UTF-8 and
defaults to SOURCE with its extension replaced by ``.tex``.
Runs on Python 2.7 and Python 3.
"""

import io
import os
import re
import sys

# Characters that get a backslash prepended after all rules have run.
quote = u'$&_%'

# Ordered (pattern, replacement) rules, applied top to bottom to every line.
# The order is significant: style-tag rules must run before the catch-all
# tag deletion, and the whitespace clean-up belongs at the end.  That is why
# this is a sequence rather than a dict.
rePatterns = (
    # paragraph styles
    # NOTE(review): these four patterns are identical and match any
    # non-blank line, so only the first one can ever fire.  Presumably each
    # pattern originally began with its own paragraph-style tag; restore
    # those tags before relying on these rules.
    (r'^((\d\.)*\s+)?(.+)$', r'\\chapter{\3}\n'),
    (r'^((\d\.)*\s+)?(.+)$', r'\\section{\3}\n'),
    (r'^((\d\.)*\s+)?(.+)$', r'\\subsection{\3}\n'),
    (r'^((\d\.)*\s+)?(.+)$', r'\\subsubsection{\3}\n'),
    # character styles
    # NOTE(review): as written this wraps every single character in
    # {\bf ...}; presumably the opening/closing character-style tags around
    # the group are missing.
    (r'(.+?)', r'{\\bf \1}'),
    # (r'(.*?)', r'\\otherfont{\1}'),
    (r'<.*?>', ''),  # delete all other tags
    # lines that start with dotted numbers = section titles
    (r'^\d+\s+(.+)$', r'\\chapter{\1}\n'),
    (r'^\d+\.\d+\.?\s+(.+)$', r'\\section{\1}\n'),
    (r'^\d+\.\d+\.\d+\.?\s+(.+)$', r'\\subsection{\1}\n'),
    # end-of-line anchor, like its siblings (was a literal "\$")
    (r'^\d+\.\d+\.\d+\.\d+\.?\s+(.+)$', r'\\subsubsection{\1}\n'),
    (r'^(\s*)[–\-·•]\s+', r'\1\\item\t'),  # itemization (lines starting with bullet etc.)
    (r'^(\s*)(\d+)\.?\)\s+', r'\1\\item[\2]\t'),  # itemization (numerical)
    (r'([Zusovz])\.([Baguo])\.', r'\1.\\,\2.'),  # u.a., s.o., o.g., z.B.
    (r'[„"“](.*?)[“”"]', r'\\quotation{\1}'),  # German quotation
    # NOTE(review): the opening class contains a plain comma, presumably as
    # a stand-in for the low single quote; verify against real input.
    (r"['’,](.*?)['’‘]", r'\\quote{\1}'),  # German single quotation
    # (r'"(.*?)"', r'\\quotation{\1}'),  # quotation?
    # character class: any one of these marks (was a group that only matched
    # the literal sequence ".?!:;")
    (r' ([.?!:;])', r'\1'),  # spaces in front of punctuation
    (r'{\\em\s+}', r''),  # empty emphasizing
    (r' (%|°)', r'\\,\1'),  # spaces in front of measure units
    (' - ', ' – '),  # en dash
    (r'(\d{4})\s*(\-|–)\s*(\d{4})', r'\1–\3'),  # year numbers
    (' +', ' '),  # multiple spaces
    (r'^\s+$', '\n'),  # make empty lines really empty
)

# A converted line belongs to a list when it contains \item, whatever
# follows it (tab, space or "[n]"), but not a longer command name.
_ITEM_RE = re.compile(r'\\item(?![A-Za-z])')

_START_ITEMIZE = u'\\startitemize[]\n'
_STOP_ITEMIZE = u'\\stopitemize\n'


def _text(value):
    """Return *value* as text; Python 2 byte-string literals are decoded as UTF-8."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def compile_rules(patterns):
    """Compile an ordered sequence of (pattern, replacement) pairs.

    Returns a list of (compiled regex, replacement template) tuples in the
    same order as *patterns*.
    """
    return [(re.compile(_text(pattern)), _text(replacement))
            for pattern, replacement in patterns]


_DEFAULT_RULES = compile_rules(rePatterns)


def convert_line(line, rules=None):
    """Apply every rule in order to *line*, then escape the ``quote`` characters.

    line  -- one line of decoded text, including its trailing newline
    rules -- output of compile_rules(); defaults to the compiled rePatterns
    """
    if rules is None:
        rules = _DEFAULT_RULES
    for pattern, replacement in rules:
        line = pattern.sub(replacement, line)
    # Escaping comes last so that it also covers characters kept by the rules
    # (e.g. the "%" behind the thin space inserted for measure units).
    for char in quote:
        line = line.replace(char, u'\\' + char)
    return line


def convert_lines(lines, rules=None):
    """Yield the converted *lines*, wrapping runs of \\item lines in an itemize.

    A ``\\startitemize[]`` line is emitted before the first item line of a
    run and a ``\\stopitemize`` line after the last one, including a run that
    is still open when the input ends.
    """
    in_items = False
    line = u''
    for raw in lines:
        line = convert_line(raw, rules)
        is_item = _ITEM_RE.search(line) is not None
        if is_item and not in_items:
            yield _START_ITEMIZE
        elif in_items and not is_item:
            yield _STOP_ITEMIZE
        in_items = is_item
        yield line
    if in_items:
        # The last input line may lack a newline; keep \stopitemize on its own line.
        if not line.endswith(u'\n'):
            yield u'\n'
        yield _STOP_ITEMIZE


def convert_file(sourcename, targetname, encoding='utf-16-be', rules=None):
    """Convert the tagged-text file *sourcename* into the ConTeXt file *targetname*.

    The source is decoded with *encoding* before it is split into lines, so
    characters whose encoded form contains a 0x0A or 0x0D byte survive intact.
    The target is written as UTF-8.  Both files are closed even on error.
    """
    with io.open(sourcename, 'r', encoding=encoding) as source:
        with io.open(targetname, 'w', encoding='utf-8') as target:
            target.writelines(convert_lines(source, rules))


def main(argv=None):
    """Command-line entry point: SOURCE [TARGET]."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("file name?")
        sys.exit()
    sourcename = argv[1]
    if len(argv) > 2:
        targetname = argv[2]
    else:
        # Swap only the extension; a plain str.replace('.txt', '.tex') leaves
        # names without ".txt" untouched and would make target == source.
        targetname = os.path.splitext(sourcename)[0] + '.tex'
    if os.path.abspath(sourcename) == os.path.abspath(targetname):
        # Opening the target for writing would truncate the source.
        sys.exit("source and target are the same file: %s" % sourcename)
    convert_file(sourcename, targetname)
    print("%s completed" % targetname)


if __name__ == '__main__':
    main()