# # The role of the verbatim parser is to encode properly from UTF-8 verbatim # text to valid latin-1 text. Two goals must be met: # # - Just encode the characters, but don't escape latex characters like in normal # text. This is why a dedicated latex encoder is used. # - When the characters are translated to macros, escape the whole sequence # to allow tex execute the macro embedded in verbatim text. # - When the escape sequence is required, update the listing environment options # if necessary. # import re from texcodec import TexCodec from texcodec import tex_handler_counter from rawparse import RawUtfParser class VerbCodec(TexCodec): def __init__(self, pre, post, errors="verbtex", input_encoding="utf8", output_encoding="latin-1"): self.pre = pre self.post = post self.output_encoding = output_encoding TexCodec.__init__(self, input_encoding, output_encoding, errors=errors, pre=pre, post=post) def decode(self, text): global tex_handler_counter ntext = TexCodec.decode(self, text) if self.output_encoding != "utf8": return ntext # Funnily listings cannot handle unicode characters greater than 255. # The loop just escapes them by wrapping with

 and  and
        # emulates the corresponding encoding exception
        text = ""
        n = tex_handler_counter[self._errors]
        for c in ntext:
            if ord(c) > 255:
                c = self.pre + c + self.post
                n += 1
            text += c
        tex_handler_counter[self._errors] = n
        return text


class VerbParser:
    def __init__(self, output_encoding="latin-1"):
        # The listing environment can be different from 'lstlisting'
        # but the rule is that it must begin with 'lst'
        self.start_re = re.compile(r"\\begin{lst[^}]*}")
        self.stop_re = re.compile(r"\\end{lst[^}]*}")
        self.esc_re = re.compile(r"escapeinside={([^}]*)}{([^}]*)}")
        self.block = ""
        self.encoding = output_encoding
        self.default_esc_start = "<:"
        self.default_esc_stop = ":>"
        self.default_codec = VerbCodec(self.default_esc_start,
                                       self.default_esc_stop,
                                       output_encoding=output_encoding)

    def parse(self, line):
        if not(self.block):
            m = self.start_re.search(line)
            if not(m):
                return line
            else:
                return self.parse_begin(line, m)
        else:
            m = self.stop_re.search(line)
            if not(m):
                return self.block_grow(line)
            else:
                return self.parse_end(line, m)

    def parse_begin(self, line, m):
        preblock = line[:m.start()]
        self.command = line[m.start():m.end()]
        line = line[m.end():]
        # By default, no escape sequence defined yet
        self.esc_start = ""
        self.esc_stop = ""
        self.options = ""

        # If there are some options, look for escape specs
        if line[0] == "[":
            e = line.find("]")+1
            self.options = line[:e]
            line = line[e:]
            m = self.esc_re.search(self.options)
            if m:
                self.esc_start = m.group(1)
                self.esc_stop = m.group(2)

        self.block_grow(line)
        return preblock

    def parse_end(self, line, m):
        self.block_grow(line[:m.start()])

        # The block is complete, find out the codec with escape sequence
        c = self.get_codec()
        c.clear_errors()

        # Now, parse/encode the block
        p = RawUtfParser(codec=c)
        text = p.parse(self.block)

        # Add the escape option if necessary
        if not(self.esc_start) and c.get_errors() != 0:
            escopt = "escapeinside={%s}{%s}" % (c.pre, c.post)
            if self.options:
                if self.options[-2] != ",":
                    escopt = "," + escopt
                self.options = self.options[:-1] + escopt + "]"
            else:
                self.options = "[" + escopt + "]"

        block = self.command + self.options + text + line[m.start():]
        self.block = ""
        return block

    def block_grow(self, line):
        self.block += line
        return ""

    def get_codec(self):
        # Something already specified
        if (self.esc_start):
            if self.esc_start != self.default_esc_start:
                return VerbCodec(self.esc_start, self.esc_stop,
                                 "verbtex" + self.esc_start,
                                 output_encoding=self.encoding)
            else:
                return self.default_codec

        # Find the starting escape sequence that does not occur in verbatim text
        s = self.default_esc_start
        iter = 0
        i = self.block.find(s)
        while (i != -1):
            s = "<" + str(iter) + ":"
            i = self.block.find(s)
            iter += 1

        # By luck the default is enough
        if (s == self.default_esc_start):
            return self.default_codec

        return VerbCodec(s, self.default_esc_stop, "verbtex" + s,
                         output_encoding=self.encoding)


if __name__ == "__main__":
    import sys
    v = VerbParser()
    f = open(sys.argv[1])
    for line in f:
        text = v.parse(line)
        if text:
            sys.stdout.write(text)