Re: converters (was: TexPaste alpha)

From: Henning Hraban Ramm <hraban@fiee.net>
To: mailing list for ConTeXt users <ntg-context@ntg.nl>
Subject: Re: converters (was: TexPaste alpha)
Date: Fri, 29 May 2009 10:14:28 +0200	[thread overview]
Message-ID: <FA19029D-43FD-4D2C-A04A-F449D91903E5@fiee.net> (raw)
In-Reply-To: <fe8d59da0905280045p86e8027xa0ea8c55e2c08569@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 882 bytes --]

Am 2009-05-28 um 09:45 schrieb luigi scarso:

> I guess I should build a new converter suite (there's also a  
> InDesign Tags to ConTeXt converter anywhere on my harddisk).
> But I won't make GUI apps, just scripts.
> That's sound good !
> If in python, even better !
> If only scripts, the best !
>
> Can we have more details ?

Which conversion do you need?

If it's InDesign to ConTeXt, some custom programming is always needed
- e.g. you need to know which InDesign paragraph style should become
which ConTeXt section level. (sample attached)

I'm not good at building parsers and mostly use regular-expression
replacements, so my converters are always limited and manual cleanup
is necessary - but they save a lot of manual work anyway!

Greetlings from Lake Constance!
Hraban
---
http://www.fiee.net/texnique/
http://wiki.contextgarden.net
https://www.cacert.org (I'm an assurer)

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: latin1_to_utf8.py --]
[-- Type: text/x-python-script; x-unix-mode=0755; x-mac-type=54455854; name="latin1_to_utf8.py", Size: 3874 bytes --]

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Universelle Textcodierung
2009-03-10 by Henning Hraban Ramm, fiëe virtuëlle

quellcodierung_to_zielcodierung.py [Optionen] Quelldatei [Zieldatei]

Es können auch ganze Verzeichnisse bearbeitet werden.

Optionen:
--filter=Dateiendung
--overwrite          (sonst wird die Originaldatei gesichert)
--hidden             (sonst werden versteckte Dateien ignoriert)
"""

import os, os.path, sys, codecs, getopt, shutil
# `latex` is an optional extra module; the script runs without it.
# NOTE(review): presumably it registers a "latex" codec on import -- confirm.
# Only a missing module is tolerated; any other failure should be visible.
try:
    import latex
except ImportError:
    pass

# Names of the supported command-line switches (long option names).
modes = ('filter', 'overwrite', 'hidden')
# Current value of every switch; filled in while the options are parsed.
mode = {}

def help(message=""):
    """Print *message* followed by the module docstring, then exit.

    The process always terminates with exit status 1, so this function
    never returns to its caller.

    message: text shown above the usage description (may be empty).
    """
    # Single-argument print calls behave the same on Python 2 and 3.
    print(message)
    print(__doc__)
    sys.exit(1)

def backup(datei):
    """Copy *datei* to the first free numbered sibling and return its path.

    The copy lives in the same directory and is named
    ``<stem>.<n><ext>``, where ``n`` is the smallest non-negative
    integer for which no such path exists yet.

    datei: path of the file to save.
    Returns the path of the newly written copy.
    """
    pfad, name = os.path.split(datei)
    stamm, ext = os.path.splitext(name)

    def kandidat(nummer):
        # Single place that knows the naming scheme of backup copies.
        return os.path.join(pfad, "%s.%d%s" % (stamm, nummer, ext))

    count = 0
    while os.path.exists(kandidat(count)):
        count += 1
    neudatei = kandidat(count)
    print("Sichere %s als %s" % (datei, neudatei))
    shutil.copy(datei, neudatei)
    return neudatei

def is_hidden(datei):
    """Return True if any component of the path *datei* is a dot-file name.

    The special components for the current and the parent directory
    (``.`` and ``..``) do not count as hidden, so relative paths such as
    ``./docs`` or ``../docs`` are not skipped by accident.
    """
    teile = os.path.normpath(datei).split(os.sep)
    return any(
        teil.startswith('.') and teil not in (os.curdir, os.pardir)
        for teil in teile
    )

def convert(source, target, so_enc, ta_enc):
    """Re-encode *source* into *target*, descending into directories.

    source: path of an existing file or directory.
    target: path of the output file or directory; it must be an existing
        directory whenever *source* is a directory.
    so_enc: codec name used to decode the input.
    ta_enc: codec name used to encode the output.

    Behaviour is steered by the module-level ``mode`` dictionary:
    ``hidden`` lets hidden files and directories through, ``filter``
    keeps only directory entries whose name ends with the given suffix
    (this is applied to every entry, sub-directories included), and
    ``overwrite`` suppresses the backup copy that is otherwise made
    before an existing file is replaced.

    Terminates the process through help() if *source* does not exist or
    if a directory is to be converted into something that is not one.
    """
    from_name = os.path.basename(source)

    if not os.path.exists(source):
        help("Quelle '%s' nicht gefunden!" % from_name)

    if os.path.isdir(source):
        if is_hidden(source) and not mode['hidden']:
            print("Ignoriere verstecktes Verzeichnis %s" % source)
            return
        if not os.path.isdir(target):
            help("Wenn die Quelle ein Verzeichnis ist, muss auch das Ziel ein Verzeichnis sein!")
        print("Verarbeite Verzeichnis %s" % source)
        dateien = os.listdir(source)
        if mode['filter']:
            dateien = [d for d in dateien if d.endswith(mode['filter'])]
        for datei in dateien:
            convert(os.path.join(source, datei),
                    os.path.join(target, datei),
                    so_enc, ta_enc)
        return

    if is_hidden(from_name) and not mode['hidden']:
        print("Ignoriere versteckte Datei %s" % source)
        return
    if os.path.isdir(target):
        # A directory target receives a file with the source's name.
        target = os.path.join(target, from_name)
    if not mode['overwrite']:
        if source == target:
            # In-place conversion: read from the saved copy instead.
            source = backup(source)
        elif os.path.exists(target):
            backup(target)
    print("Konvertiere %s (%s)\n\tnach %s (%s)" % (source, so_enc, target, ta_enc))

    # The input is read completely before the output is opened, because
    # both paths may name the same file when 'overwrite' is active.
    with open(source, "rU") as so_file:
        raw = so_file.read()
    # Decode and encode the text as a whole: doing it line by line breaks
    # codecs that keep state across lines or emit a byte order mark on
    # every encode() call.
    text = unicode(raw, so_enc)
    with open(target, "w") as ta_file:
        ta_file.write(text.encode(ta_enc))

# Parse the command line.  An unknown option or a missing option value
# ends in the usage text instead of a traceback.
try:
    opts, args = getopt.getopt(sys.argv[1:], "ohf:", ["overwrite","hidden","filter="])
except getopt.GetoptError as err:
    help(str(err))

if len(args)<1:
    help("Zu wenige Parameter angegeben!")

# Every switch defaults to False; a switch given with a value (--filter)
# stores that value, a plain flag stores True.  Both the short form
# (first letter of the name) and the long form are accepted.
for m in modes:
    mode[m] = False
    for (o, a) in opts:
        if o in ('-'+m[0], '--'+m):
            if a:
                print("Modus %s = %s" % (m, a))
            else:
                a = True
                print("Modus %s aktiv" % m)
            mode[m] = a

# Read the wanted encodings from the name of the script itself, which
# has to look like <source encoding>_to_<target encoding>.py
scriptname = os.path.splitext(os.path.basename(sys.argv[0]))[0]
enc_parts = scriptname.split("_to_")
if len(enc_parts) != 2:
    help("Skriptname '%s' muss die Form quellcodierung_to_zielcodierung haben!" % scriptname)
from_enc, to_enc = enc_parts

# Reject unknown codec names before any file is touched.
for enc in (from_enc, to_enc):
    try:
        codecs.lookup(enc)
    except LookupError:
        help("Unbekannte Codierung '%s'!" % enc)

# Without an explicit target the source is converted in place.
from_name = to_name = args[0]
if len(args)>1: to_name = args[1]

convert(from_name, to_name, from_enc, to_enc)

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: indtxt2context.py --]
[-- Type: text/x-python-script; x-mac-creator=21526368; x-unix-mode=0644; x-mac-type=54455854; name="indtxt2context.py", Size: 2773 bytes --]

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Convert InDesign tagged text to ConTeXt
"""
import sys, os
import re

quote = u'$&_%'

rePatterns = {
	# paragraph styles
	ur'^<pstyle:Ü 1\.>((\d\.)*\s+)?(.+)$' : ur'\\chapter{\3}\n',
	ur'^<pstyle:Ü 1\.1>((\d\.)*\s+)?(.+)$' : ur'\\section{\3}\n',
	ur'^<pstyle:Ü 1\.1\.1>((\d\.)*\s+)?(.+)$' : ur'\\subsection{\3}\n',
	ur'^<pstyle:Ü 1\.1\.1\.1>((\d\.)*\s+)?(.+)$' : ur'\\subsubsection{\3}\n',
	# character styles
	ur'<ct:Bold>(.+?)<ct:>' : ur'{\\bf \1}',
	#ur'<cf:Arial>(.*?)<cf:Times New Roman>' : ur'\\otherfont{\1}',

	u'<.*?>' : u'', # delete all other tags

	# lines that start with dotted numbers = section titles
	ur'^\d+\s+(.+)$' : ur'\\chapter{\1}\n',
	ur'^\d+\.\d+\.?\s+(.+)$' : ur'\\section{\1}\n',
	ur'^\d+\.\d+\.\d+\.?\s+(.+)$' : ur'\\subsection{\1}\n',
	ur'^\d+\.\d+\.\d+\.\d+\.?\s+(.+)\$' : ur'\\subsubsection{\1}\n',

	ur'^(\s*)[–\-·•]\s+' : ur'\1\\item\t', # itemization (lines starting with bullet etc.)
	ur'^(\s*)(\d+)\.?\)\s+' : ur'\1\\item[\2]\t', # itemization (numerical)
	ur'([Zusovz])\.([Baguo])\.' : ur'\1.\\,\2.', # u.a., s.o., o.g., z.B.
	ur'[„"“](.*?)[“”"]' : ur'\\quotation{\1}', # German quotation
	ur'[\'’,](.*?)[\'’‘]' : ur'\\quote{\1}', # German single quotation
	#ur'"(.*?)"' : ur'\\quotation{\1}', # quotation?
	ur' (\.\?\!:;)' : ur'\1', # spaces in front of punctuation
	ur'{\\em\s+}' : ur'', # empty emphasizing
	ur' (%|°)' : ur'\\,\1', # spaces in front of measure units
	u' - ' : u' – ', # en dash
	ur'(\d{4})\s*(\-|–)\s*(\d{4})' : ur'\1–\3', # year numbers

	u' +' : u' ', # multiple spaces
	u'^\s+$' : u'\n', # make empty lines really empty

#	ur'' : ur'',

}

reres = {}
status = {
	'item' : False
}

# collect parameters
if len(sys.argv) > 1:
	sourcename = sys.argv[1]
	if len(sys.argv) > 2:
		targetname = sys.argv[2]
	else:
		targetname = sourcename.replace('.txt', '.tex')
else:
	print "file name?"
	sys.exit()

# compile regular expressions
for k in rePatterns:
	p = re.compile(k)
	reres[p] = rePatterns[k]

source = open(sourcename, 'rU')
target = open(targetname, 'w')

# convert lines
for line in source.readlines():
	line = unicode(line, 'utf-16be') # "unicode" encoded InDesign tagged text is UTF-16 big-endian encoded!
	for p in reres:
		line = p.sub(reres[p], line)
	for c in quote:
		line = line.replace(c, u'\\'+c)
	if '\\item ' in line and not status['item']:
		target.write('\\startitemize[]\n')
		status['item'] = True
	if status['item'] and not '\\item ' in line:
		target.write('\\stopitemize\n')
		status['item'] = False
	target.write(line.encode('utf-8')) # write UTF-8

source.close()
target.close()

print "%s completed" % targetname

[-- Attachment #4: Type: text/plain, Size: 487 bytes --]

___________________________________________________________________________________
If your question is of interest to others as well, please add an entry to the Wiki!

maillist : ntg-context@ntg.nl / http://www.ntg.nl/mailman/listinfo/ntg-context
webpage  : http://www.pragma-ade.nl / http://tex.aanhet.net
archive  : https://foundry.supelec.fr/projects/contextrev/
wiki     : http://contextgarden.net
___________________________________________________________________________________