#
# The Latex Codec handles the encoding from UTF-8 text to latin1
# latex compatible text.
#
# Encoded text is handled as byte strings throughout, and only syntax that
# is valid in both Python 2 (2.6+) and Python 3 is used.
#
import re
import codecs
import unient

# Dictionary of the handlers installed, keyed by error handler name
tex_handler_installed = {}
# Number of characters replaced so far by each handler, keyed by handler name
tex_handler_counter = {}


def latex_char_replace(exc, pre, post, name):
    """Codec error handler replacing unencodable characters by TeX text.

    Each character of the failing span ``exc.object[exc.start:exc.end]`` is
    looked up by code point in ``unient.unicode_map``.  A character with no
    entry is reported on standard output and replaced by an escaped
    ``\\&\\#x<hex>;`` marker.  Every replacement is surrounded by `pre` and
    `post` when these are non-empty.

    Parameters:
        exc:  the exception given by the codec machinery.
        pre:  text put before each replacement (skipped when empty).
        post: text put after each replacement (skipped when empty).
        name: key of `tex_handler_counter` whose count is increased by one
              for every character handled.

    Returns:
        The ``(replacement, resume_position)`` tuple expected from a codec
        error handler; the position is ``exc.end``.

    Raises:
        TypeError: if `exc` is not a UnicodeEncodeError.
        KeyError:  if `name` has no entry in `tex_handler_counter`.
    """
    if not isinstance(exc, UnicodeEncodeError):
        raise TypeError("don't know how to handle %r" % exc)

    # Read the counter first, so that an unknown handler name fails before
    # anything is printed or replaced.
    count = tex_handler_counter[name]
    pieces = []
    for c in exc.object[exc.start:exc.end]:
        if pre:
            pieces.append(pre)
        try:
            pieces.append(unient.unicode_map[ord(c)])
        except KeyError:
            # Single parenthesized argument: same output whether print is a
            # statement (Python 2) or a function (Python 3).
            print("Missing character &#x%x;" % ord(c))
            pieces.append(u"\\&\\#x%x;" % ord(c))
        if post:
            pieces.append(post)
        count += 1
    tex_handler_counter[name] = count
    return (u"".join(pieces), exc.end)


class TexCodec:
    """Convert text between an input and an output encoding for TeX.

    Characters that the output encoding cannot represent are delegated to
    the `latex_char_replace` error handler; some bytes of the encoded
    result are then substituted according to `charmap`.
    """

    # This mapping for characters < 256 seems enough for latin1 output.
    # Keys and values are byte strings because the mapping is applied to the
    # encoder output.  The commented entries are disabled mappings.
    charmap = {
        b"\xa0": br"~",
        # b"\xa2": br"\textcent{}",
        # b"\xa4": br"\textcurrency{}",
        b"\xa5": br"$\yen$",
        # b"\xa6": br"\textbrokenbar{}",
        b"\xac": br"\ensuremath{\lnot}",
        # b"\xad": br"", # FIXME: bug around soft hyphen...
        b"\xb0": br"\textdegree{}",
        b"\xb1": br"\ensuremath{\pm}",
        b"\xb2": br"$^2$",
        b"\xb3": br"$^3$",
        b"\xb5": br"$\mathrm{\mu}$",
        b"\xb9": br"$^1$",
        b"\xd7": br"$\times$",
        b"\xf7": br"$\div$",
    }

    def __init__(self, input_encoding="utf8", output_encoding="latin-1",
                 errors="latexcharreplace", pre="", post=""):
        """Set up the decoder, the encoder and the error handler.

        Parameters:
            input_encoding:  encoding of the byte strings given to decode().
            output_encoding: encoding of the byte strings built by encode().
            errors: name under which the replacement handler is registered
                    in the codecs registry, and key of its counter.
            pre, post: text put around each replaced character.  They are
                    only taken into account by the first instance that
                    registers the `errors` name; later instances reuse the
                    handler already installed.

        When both encodings are equal the character mapping is disabled for
        this instance, and neither the handler registration nor the counter
        reset is done.
        """
        self._errors = errors
        self._decode = codecs.getdecoder(input_encoding)
        self._encode = codecs.getencoder(output_encoding)

        # No different output encoding expected?
        if input_encoding == output_encoding:
            self.charmap = {}
            return

        # The codecs registry is global: install each handler name once
        if self._errors not in tex_handler_installed:
            f = self.build_error_func(pre, post, errors)
            codecs.register_error(self._errors, f)
            tex_handler_installed[self._errors] = f

        self.clear_errors()

    def clear_errors(self):
        """Reset the count of characters replaced by this codec's handler."""
        tex_handler_counter[self._errors] = 0

    def get_errors(self):
        """Return the count of characters replaced by this codec's handler.

        The count is shared by all the codecs using the same handler name.
        It is 0 when the counter has never been initialized, which occurs
        for a codec built with identical input and output encodings.
        """
        return tex_handler_counter.get(self._errors, 0)

    def build_error_func(self, pre="", post="", errors="charrep"):
        """Return a codec error handler bound to `pre`, `post` and `errors`."""
        return lambda exc: latex_char_replace(exc, pre, post, errors)

    def decode(self, text):
        """Decode the `text` byte string from the input encoding."""
        return self._decode(text)[0]

    def encode(self, text):
        """Encode `text` to the output encoding and apply `charmap`.

        Returns the encoded byte string.
        """
        text = self._encode(text, self._errors)[0]
        for c, v in self.charmap.items():
            text = text.replace(c, v)
        return text


class LatexCodec(TexCodec):
    """TexCodec that also escapes the characters having a meaning in TeX."""

    def __init__(self, input_encoding="utf8", output_encoding="latin-1"):
        """Build the codec and the ordered list of TeX escaping rules."""
        TexCodec.__init__(self, input_encoding, output_encoding)

        # Pairs of (compiled pattern, replacement) applied in sequence
        self.texres = (
            # Kind of normalize
            (re.compile(r"^[\s\n]*$"), r" "),
            # TeX escapes (the order is important)
            (re.compile(r"([{}%_^$&#])"), r"\\\1"),
            # '<' and '>' in the list to avoid french quotation mark symptoms
            (re.compile(r"([-^<>])"), r"\1{}"),
            # backtick (`) must not be confused with U+2018 (left single
            # quotation mark)
            (re.compile(r"`"), r"\\`{}"),
            # tilde (~) must not be confused with U+00A0 (no-break space)
            (re.compile(r"~"), r"\\textasciitilde{}"))

    def _texescape(self, text):
        """Apply every rule of `self.texres` to `text`, in order."""
        for r, s in self.texres:
            text = r.sub(s, text)
        return text

    def encode(self, text):
        """Escape `text` for TeX and encode it to the output encoding.

        Returns the encoded byte string.
        """
        # Preliminary backslash substitution.  The closing {} is only added
        # at the very end, otherwise the braces would be escaped too.
        text = text.replace("\\", r"\textbackslash")

        # Basic TeX escape
        text = self._texescape(text)

        # Encode UTF-8 -> Latin-1 + latex specific
        text = self._encode(text, self._errors)[0]

        # Special Character Mapping
        for c, v in self.charmap.items():
            text = text.replace(c, v)

        # Things are done, complete with {}
        text = text.replace(br"\textbackslash", br"\textbackslash{}")
        return text


def main():
    """Encode the file named by the first command line argument.

    Each line is decoded then encoded with a default LatexCodec, and the
    whole result is written to standard output.
    """
    import sys
    c = LatexCodec()

    # decode() needs the raw bytes: Python 3 text mode would give lines that
    # are already decoded.  Python 2 keeps its historical text mode.
    if sys.version_info[0] >= 3:
        mode = "rb"
    else:
        mode = "r"

    # The with statement closes the file even if the conversion fails
    with open(sys.argv[1], mode) as f:
        text = b"".join([c.encode(c.decode(line)) for line in f])

    if text:
        # Let the "Missing character" messages out before the raw bytes
        sys.stdout.flush()
        # Python 3 only accepts bytes on the underlying binary stream
        getattr(sys.stdout, "buffer", sys.stdout).write(text)


if __name__ == "__main__":
    main()