Source code for islex.tokens

# -*- coding: utf-8 -*-

import attr
from attr.validators import optional, instance_of
import re
from six import text_type as unicode
import enum
import itertools


@enum.unique
class PosCategory(enum.Enum):
    """Enumeration of possible POS tags.

    Member names are looked up by upper-casing the tag strings found in the
    lexicon (see ``Pos.from_string``), so the names themselves are part of
    the data format. The numeric values are unique (enforced by
    ``enum.unique``) and are grouped in ranges by broad word class.
    """

    UNUSED_POSTAG = 0

    # Adverbs, and comparative/superlative forms.
    RB = 1
    RBR = 2
    RBS = 3

    # (Common) nouns.
    NN = 4
    NNS = 5

    # Adjectives in various forms (incl comparative & superlative).
    JJ = 6
    JJR = 7
    JJS = 8

    # Verbs in various conjugations.
    VB = 9
    VBD = 10
    VBG = 11
    VBP = 12
    VBN = 13
    VBZ = 14

    # Proper nouns.
    NNP = 15
    NNPS = 16

    # Discourse markers.
    LS = 20
    FW = 21
    UH = 22

    # Various closed-class items.
    DT = 50  # Determiners.
    EX = 51  # Existential there.
    # Original note: "Count determiner?". NOTE(review): the names look like
    # Penn Treebank tags, where CD is "cardinal number" -- confirm.
    CD = 52
    MD = 53
    IN = 54
    TO = 55
    OF = 56
    PRP = 57
    CC = 58
    PDT = 59
    WRB = 60
    WDT = 61
    WP = 62
    RP = 63

    # Symbol? (original author's uncertain heading for the items below)
    ABBREVIATION = 80
    SYM = 81
    PUNC = 82
@enum.unique
class EntityCategory(enum.Enum):
    """Enumeration of entity kinds that can qualify a proper-noun tag.

    Member names are looked up by upper-casing the suffix of ``nnp_*`` /
    ``nnps_*`` tag strings (see ``Pos.from_string``). Values are unique, as
    enforced by ``enum.unique``.
    """

    UNSPECIFIED_ENTITY = 0
    PRODUCT = 1
    CITY = 2
    SURNAME = 3
    EVENT = 4
    COUNTRY = 5
    CONTINENT = 6
    PERSON = 7
    ORGANIZATION = 8
    COMPANY = 9
    PLACE = 10
    STATE = 11
    MONTH = 12
    BOYNAME = 13
    GIRLNAME = 14
# Matches a trailing "_<digit>.<digits>" score suffix on a tag.
_scored_patt = re.compile(r'_\d\.\d+$')

# Known-bad tags that are rewritten when the whole tag matches.
_EXACT_TAG_FIXES = {
    '_country_': 'nnp_country',
    'vpb': 'vb',  # "carjack" is listed with vpb tag.
    'nnd': 'nns',  # "abbes" is listed with nnd tag.
    'nns_root:': 'nns',  # 'micros' is listed as nns_root.
    'root:zygote': 'nn',  # 'root:zygote' for zygote. :-/
    'abbr_united_states_marine_corps': 'abbreviation',
    'abbr_orange_juice': 'abbreviation',
    '+abbreviation': 'abbreviation',
}

# Known-bad tag prefixes; consulted only when no exact fix applies, so
# 'root:zygote' above still wins over the generic 'root:' rule.
_PREFIX_TAG_FIXES = (
    ('_country:', 'nnp_country'),
    ('root:', 'uh'),  # Don't know why, but these are all UH tokens.
    ('fw_misspelling:', 'fw'),
)


def _clean_tag(t):
    """Fix up some garbage errors.

    Strips a trailing score suffix from the tag string ``t`` and maps a
    handful of known-bad tags (or tag prefixes) onto usable ones. Tags that
    match no rule are returned with only the score suffix removed.
    """
    # TODO: when score present, include info.
    tag = _scored_patt.sub('', t)
    if tag in _EXACT_TAG_FIXES:
        return _EXACT_TAG_FIXES[tag]
    for prefix, replacement in _PREFIX_TAG_FIXES:
        if tag.startswith(prefix):
            return replacement
    return tag


@attr.s
class Pos(object):
    """Contains part of speech information and (possibly) entity category."""

    category = attr.ib(validator=instance_of(PosCategory))
    entity_type = attr.ib(default=None,
                          validator=optional(instance_of(EntityCategory)))

    @classmethod
    def from_string(cls, t):
        """Build a ``Pos`` from a lower-case tag string.

        ``nnp_<entity>`` and ``nnps_<entity>`` become NNP / NNPS with the
        matching ``EntityCategory``; any other string is looked up directly
        in ``PosCategory``. An unknown tag or entity name raises
        ``KeyError`` from the enum lookup.
        """
        # Extract some systematic structure from the data.
        proper_noun_prefixes = (
            ('nnp_', PosCategory.NNP),
            ('nnps_', PosCategory.NNPS),
        )
        for prefix, category in proper_noun_prefixes:
            if t.startswith(prefix):
                entity = EntityCategory[t[len(prefix):].upper()]
                return cls(category=category, entity_type=entity)
        return cls(category=PosCategory[t.upper()], entity_type=None)

    def to_string(self):
        """Return the lower-case tag string, with ``_<entity>`` if set."""
        base = self.category.name.lower()
        if self.entity_type is None:
            return base
        return base + u'_' + self.entity_type.name.lower()
@attr.s
class Morph(object):
    """Contains a morphological analysis of the corresponding ortho.

    May be more than one in a word, e.g. axes -> (ax + s), (axe + s)
    """

    emes = attr.ib(validator=instance_of(tuple))  # of strings

    @classmethod
    def from_string(cls, s):
        """Parse a ``+``-separated morpheme string, dropping empty pieces."""
        pieces = tuple(filter(None, s.split('+')))
        return cls(emes=pieces)

    def to_string(self):
        """Return the morphemes joined by ``+``, with a leading ``+``."""
        joined = u'+'.join(self.emes)
        return u'+' + joined
@attr.s
class Phone(object):
    """Contains a cluster of IPA characters indicating a single phone(me)."""

    # The validator must be passed by keyword: the first positional argument
    # of attr.ib is ``default``, so a positional instance_of(...) would be
    # stored as the default value and never run as a check.
    value = attr.ib(validator=instance_of(unicode))
@attr.s
class Syllable(object):
    """A sequence of phones making up one syllable."""

    # Passed by keyword on purpose: attr.ib's first positional argument is
    # ``default``, so a positional instance_of(tuple) would become the
    # default value instead of validating. from_string fills this with
    # Phone objects.
    phones = attr.ib(validator=instance_of(tuple))

    @classmethod
    def from_string(cls, s):
        """Build a syllable from whitespace-separated phone strings."""
        return cls(phones=tuple(Phone(value=p) for p in s.split()))

    @property
    def ipa(self):
        """Tuple of the ``value`` of each phone, in order."""
        return tuple(ph.value for ph in self.phones)

    def to_string(self):
        """Return the phone values joined by single spaces."""
        return u" ".join(ph.value for ph in self.phones)
@attr.s
class Pron(object):
    """One pronunciation, stored as a tuple of syllables."""

    sylls = attr.ib(validator=instance_of(tuple))

    @classmethod
    def from_string(cls, s, clean=False):
        """Parse a pronunciation whose syllables are separated by `` . ``.

        Surrounding whitespace is stripped first. With ``clean`` set, the
        two-character sequence in the replace call below is rewritten to
        its bare first character before the syllables are parsed.
        """
        chunks = s.strip().split(u' . ')
        if clean:
            chunks = [chunk.replace(u"ɛ̃", u"ɛ") for chunk in chunks]
        # TODO: break into phones?
        syllables = tuple(map(Syllable.from_string, chunks))
        return cls(sylls=syllables)

    @property
    def ipa(self):
        """Flat tuple of every syllable's ``ipa`` items, in order."""
        flat = []
        for syll in self.sylls:
            flat.extend(syll.ipa)
        return tuple(flat)

    def to_string(self):
        """Return the syllables' string forms joined by `` . ``."""
        rendered = [syll.to_string() for syll in self.sylls]
        return u' . '.join(rendered)
@attr.s
class Word(object):
    """One lexicon entry: spelling, tags, morphology and pronunciations.

    The string form handled by ``from_string`` / ``to_string`` is
    ``ortho(tag,tag,...)`` followed by one or more ``#``-separated
    pronunciations.
    """

    ortho = attr.ib()
    pos = attr.ib(validator=instance_of(tuple))  # of Pos
    morphs = attr.ib(validator=instance_of(tuple))  # of Morph objects
    # TODO: validate that each element is instance of...
    prons = attr.ib(validator=instance_of(tuple))  # of Prons

    # Splits "ortho(tags)" into the spelling and the raw comma-separated tags.
    ortho_patt = re.compile(r'^([^\(]+?)\((.*)\)\s*$')

    @classmethod
    def from_string(cls, s, clean=False):
        """Parse one entry line into a ``Word``.

        Args:
            s: the entry text, ``ortho(tags)#pron#pron...``.
            clean: when true, each tag is passed through ``_clean_tag`` and
                each pronunciation is parsed with ``clean=True``.

        Raises:
            ValueError: if there is no pronunciation segment, if the first
                segment is not of the form ``ortho(tags)``, or if a tag is
                not a known POS / entity name.
        """
        s = s.strip()
        raw_prons = s.split(u'#')
        # Drop empty trailing segments. The emptiness guard keeps blank
        # input from raising IndexError, so it reaches the ValueError below.
        while raw_prons and not raw_prons[-1]:
            raw_prons.pop()
        if len(raw_prons) < 2:
            raise ValueError("string doesn't have enough segments: %s" % s)

        raw_ortho = raw_prons.pop(0)
        m = cls.ortho_patt.match(raw_ortho)
        if not m:
            # The format string needs a %s; without it the % operator
            # raises TypeError and hides the intended ValueError.
            raise ValueError("ortho doesn't match expected: %s" % raw_ortho)
        ortho, raw_pos = m.groups()

        all_morphs = []
        all_pos = []
        for t in raw_pos.split(','):
            if clean:
                t = _clean_tag(t)
            if not t:
                continue
            # Tags starting with '+' are morphological analyses, the rest
            # are part-of-speech tags.
            if t.startswith('+'):
                all_morphs.append(Morph.from_string(t))
            else:
                try:
                    all_pos.append(Pos.from_string(t))
                except KeyError as k:
                    raise ValueError("pos tag %s not found" % k)

        all_prons = [Pron.from_string(raw_pron, clean=clean)
                     for raw_pron in raw_prons]
        return cls(ortho=ortho,
                   morphs=tuple(all_morphs),
                   pos=tuple(all_pos),
                   prons=tuple(all_prons))

    def to_string(self):
        """Serialise as ``ortho(morphs,tags) # pron # pron``."""
        morph_pos = ','.join([m.to_string() for m in self.morphs + self.pos])
        key = u'%s(%s)' % (self.ortho, morph_pos)
        return u' # '.join([key] + [p.to_string() for p in self.prons])

    @property
    def ipa(self):
        """Flat tuple of every pronunciation's ``ipa`` items, in order."""
        return tuple(itertools.chain.from_iterable(
            pron.ipa for pron in self.prons))