Replacing abbreviations via Python and regular expressions

When playing around with NLP and text corpora in Python, I frequently come across simple abbreviations like “z.B.”, the German equivalent of “e.g.”, which I sometimes want to expand. This is a handy utility function (for German) that expands a predefined set of abbreviations using regular expressions:

def expand_abbreviations(text):
    """Expand common German abbreviations in *text* to their long form.

    The abbreviation table is compiled into regular expressions on the first
    call and cached on the function object (``__replacements``); later calls
    re-use the cached list.

    Matching rules:

    * Matching is case-sensitive ("S." -> "Seite", "s." -> "siehe").
    * Whitespace after an inner dot is tolerated, so "z.B." and "z. B." are
      treated equally.
    * An abbreviation is only expanded when it starts at a word boundary and
      is directly followed by whitespace or one of ``: , ; ? ! -``. An
      abbreviation at the very end of *text* is therefore left untouched.

    Args:
        text: The string to process.

    Returns:
        A copy of *text* with every recognised abbreviation replaced.
    """
    if expand_abbreviations.__replacements is None:
        # Function-scope import keeps the snippet self-contained.
        import re

        abbreviation_mappings = {
            "Abb.": "Abbildung",
            "Abschn.": "Abschnitt",
            "Alt.": "Alternative",
            "Anm.": "Anmerkung",
            "Aufl.": "Auflage",
            "bsp.": "Beispiel",
            "bspw.": "beispielsweise",
            "bzgl.": "bezüglich",
            "ca.": "circa",
            "d.h.": "das heißt",
            "etc.": "et cetera",
            "evtl.": "eventuell",
            "f.": "für",
            "ff.": "fortfolgend",
            "ggf.": "gegebenenfalls",
            "Hrsg.": "Herausgeber",
            "i.d.R.": "in der Regel",
            "inkl.": "inklusive",
            "max.": "maximal",
            "min.": "minimal",
            "Mio.": "Million",
            "Mrd.": "Milliarde",
            "Nr.": "Nummer",
            "o.ä.": "oder ähnliches",
            "Pos.": "Position",
            "Rd.": "rund",
            "S.": "Seite",
            "s.": "siehe",
            "Tab.": "Tabelle",
            "Tel.": "Telefon",
            "u.ä.": "und ähnliches",
            "u.a.": "unter anderem",
            "usw.": "und so weiter",
            "v.a.": "vor allem",
            "vgl.": "vergleiche",
            "z.B.": "zum Beispiel",
            "z.H.": "zu Händen",
            "z.T.": "zum Teil",
        }
        # Look-ahead only: the character after the abbreviation is required
        # but not consumed, so it survives the substitution.
        trailing_context = r"(?=[\s:,;?!\-])"
        replacements = []
        for abbreviation, expansion in abbreviation_mappings.items():
            # Every key ends with a dot. Split it into its letter groups and
            # re-join them with an *escaped* dot plus optional whitespace.
            # The final dot is escaped explicitly; left as a bare "." it
            # would match any character (e.g. "s." would match "so").
            parts = abbreviation.rstrip(".").split(".")
            pattern = (
                r"\b"
                + r"\.\s*".join(re.escape(part) for part in parts)
                + r"\."
                + trailing_context
            )
            replacements.append((re.compile(pattern), expansion))
        expand_abbreviations.__replacements = replacements
    for needle, expansion in expand_abbreviations.__replacements:
        text = needle.sub(expansion, text)
    return text


# The cache attribute must exist before the first call so that the
# ``is None`` check inside the function does not raise AttributeError.
expand_abbreviations.__replacements = None

The function builds its lookup table only once, on the first call, and re-uses it afterwards. It also tolerates whitespace inside an abbreviation, so “z.B.” and “z. B.” are treated equally. Example:

# Usage example: the input mixes several abbreviations, one of them written
# with extra whitespace after the inner dot ("z.   T.").
txt_in = "Anm.: die Abb. selbst erinnert z.   T., zumindest mich, u.a. an einen Kupferstich o.ä.!"
txt_out = expand_abbreviations(txt_in)
# Expected value of txt_out:
# "Anmerkung: die Abbildung selbst erinnert zum Teil, zumindest mich, unter anderem an einen Kupferstich oder ähnliches!"
Tagged with: , , ,
Posted in Development, Python