# -*- coding: utf-8 -*-
import attr
from attr.validators import optional, instance_of
import re
from six import text_type as unicode
import enum
import itertools
@enum.unique
class PosCategory(enum.Enum):
    """Enumeration of possible POS tags.

    Member names are the upper-cased tag strings; ``Pos.from_string`` looks
    members up by name (``PosCategory[t.upper()]``). The integer values are
    arbitrary identifiers grouped in numeric ranges by word class, and
    ``enum.unique`` guarantees that no two members share a value.
    """

    UNUSED_POSTAG = 0

    # Adverbs, and comparative/superlative forms.
    RB = 1
    RBR = 2
    RBS = 3

    # (Common) nouns.
    NN = 4
    NNS = 5

    # Adjectives in various forms (incl comparative & superlative).
    JJ = 6
    JJR = 7
    JJS = 8

    # Verbs in various conjugations.
    VB = 9
    VBD = 10
    VBG = 11
    VBP = 12
    VBN = 13
    VBZ = 14

    # Proper nouns.
    NNP = 15
    NNPS = 16

    # Discourse markers.
    LS = 20
    FW = 21
    UH = 22

    # Various closed-class items.
    DT = 50  # Determiners.
    EX = 51  # Existential there.
    # NOTE(review): the original comment guessed "count determiner?"; in the
    # Penn Treebank tagset (which these names appear to follow) CD is
    # "cardinal number". Confirm which meaning the data uses.
    CD = 52
    MD = 53
    IN = 54
    TO = 55
    OF = 56
    PRP = 57
    CC = 58
    PDT = 59
    WRB = 60
    WDT = 61
    WP = 62
    RP = 63

    # Abbreviations, symbols and punctuation.
    ABBREVIATION = 80
    SYM = 81
    PUNC = 82
@enum.unique
class EntityCategory(enum.Enum):
    """Enumeration of entity types that can qualify a proper-noun tag.

    ``Pos.from_string`` looks members up by name from the suffix of
    ``nnp_<entity>`` / ``nnps_<entity>`` tags (upper-cased), so member names
    must match those suffixes. ``enum.unique`` guarantees distinct values.
    """

    UNSPECIFIED_ENTITY = 0
    PRODUCT = 1
    CITY = 2
    SURNAME = 3
    EVENT = 4
    COUNTRY = 5
    CONTINENT = 6
    PERSON = 7
    ORGANIZATION = 8
    COMPANY = 9
    PLACE = 10
    STATE = 11
    MONTH = 12
    BOYNAME = 13
    GIRLNAME = 14
# Matches a trailing score suffix: underscore, one digit, a dot, more digits.
_scored_patt = re.compile(r'_\d\.\d+$')

# Tags that are rewritten only when they match in full.
_EXACT_TAG_FIXES = {
    '_country_': 'nnp_country',
    'vpb': 'vb',  # "carjack" is listed with vpb tag.
    'nnd': 'nns',  # "abbes" is listed with nnd tag.
    'nns_root:': 'nns',  # 'micros' is listed as nns_root.
    'root:zygote': 'nn',  # 'root:zygote' for zygote. :-/
    'abbr_united_states_marine_corps': 'abbreviation',
    'abbr_orange_juice': 'abbreviation',
    '+abbreviation': 'abbreviation',
}

# Tags that are rewritten when they start with the given prefix. Consulted
# only after the exact table, so 'root:zygote' wins over the 'root:' prefix.
_PREFIX_TAG_FIXES = (
    ('_country:', 'nnp_country'),
    ('root:', 'uh'),  # Don't know why, but these are all UH tokens.
    ('fw_misspelling:', 'fw'),
)


def _clean_tag(t):
    """Fix up some garbage errors.

    Strips a trailing score suffix from the tag, then maps known-bad tags to
    their corrected form. Tags with no known correction are returned with
    only the score suffix removed.
    """
    # TODO: when score present, include info.
    tag = _scored_patt.sub('', t)

    replacement = _EXACT_TAG_FIXES.get(tag)
    if replacement is not None:
        return replacement

    for prefix, fixed in _PREFIX_TAG_FIXES:
        if tag.startswith(prefix):
            return fixed

    return tag
@attr.s
class Pos(object):
    """Contains part of speech information and (possibly) entity category."""

    category = attr.ib(validator=instance_of(PosCategory))
    entity_type = attr.ib(default=None,
                          validator=optional(instance_of(EntityCategory)))

    @classmethod
    def from_string(cls, t):
        """Build a ``Pos`` from a lower-case tag string.

        Tags of the form ``nnp_<entity>`` or ``nnps_<entity>`` become a
        proper-noun category plus the named entity type; any other tag is
        looked up directly as a ``PosCategory`` name with no entity type.

        Raises:
            KeyError: if the tag (or its entity suffix) is not a member name
                of the corresponding enum.
        """
        # Extract some systematic structure from the data.
        if t.startswith('nnp_'):
            postag = PosCategory.NNP
            entity = EntityCategory[t[4:].upper()]
        elif t.startswith('nnps_'):
            postag = PosCategory.NNPS
            entity = EntityCategory[t[5:].upper()]
        else:
            postag = PosCategory[t.upper()]
            entity = None
        return cls(category=postag, entity_type=entity)

    def to_string(self):
        """Return the lower-case tag, with ``_<entity>`` appended if set."""
        out = self.category.name.lower()
        if self.entity_type is not None:
            out += u'_' + self.entity_type.name.lower()
        return out
@attr.s
class Morph(object):
    """Contains a morphological analysis of the corresponding ortho.

    May be more than one in a word, e.g. axes -> (ax + s), (axe + s)
    """

    emes = attr.ib(validator=instance_of(tuple))  # of strings

    @classmethod
    def from_string(cls, s):
        """Parse a ``+``-delimited morpheme string such as ``+ax+s``.

        Empty pieces (from the leading ``+`` or doubled separators) are
        discarded.
        """
        return cls(emes=tuple(m for m in s.split('+') if m))

    def to_string(self):
        """Return the morphemes joined by ``+``, with a leading ``+``."""
        return u'+' + u'+'.join(self.emes)
@attr.s
class Phone(object):
    """Contains a cluster of IPA characters indicating a single phone(me)."""

    # The validator must be passed by keyword: the first positional parameter
    # of attr.ib is ``default``, so a positional validator would silently
    # become the attribute's default value and nothing would be validated.
    value = attr.ib(validator=instance_of(unicode))
@attr.s
class Syllable(object):
    """A single syllable, stored as a tuple of ``Phone`` objects."""

    # Passed by keyword on purpose: the first positional parameter of attr.ib
    # is ``default``, so a positional validator would never run.
    phones = attr.ib(validator=instance_of(tuple))  # of Phone

    @classmethod
    def from_string(cls, s):
        """Parse a whitespace-separated string of phones into a Syllable."""
        return cls(phones=tuple(Phone(value=p) for p in s.split()))

    @property
    def ipa(self):
        """Tuple of the raw string value of each phone, in order."""
        return tuple(ph.value for ph in self.phones)

    def to_string(self):
        """Return the phone values joined by single spaces."""
        return u" ".join(ph.value for ph in self.phones)
@attr.s
class Pron(object):
    """One pronunciation, stored as a tuple of ``Syllable`` objects."""

    sylls = attr.ib(validator=instance_of(tuple))  # of Syllable

    @classmethod
    def from_string(cls, s, clean=False):
        """Parse a pronunciation whose syllables are separated by ``' . '``.

        Args:
            s: the pronunciation string; surrounding whitespace is ignored.
            clean: when true, apply the vowel normalization below to each
                syllable before it is parsed.
        """
        s = s.strip()
        raw_sylls = s.split(u' . ')
        if clean:
            # Replace the tilde-marked vowel with its plain form.
            raw_sylls = [p.replace(u"ɛ̃", u"ɛ") for p in raw_sylls]
        # TODO: break into phones?
        return cls(sylls=tuple(Syllable.from_string(p) for p in raw_sylls))

    @property
    def ipa(self):
        """Flat tuple of phone values across all syllables, in order."""
        return tuple(itertools.chain.from_iterable(syll.ipa
                                                   for syll in self.sylls))

    def to_string(self):
        """Return the syllables' string forms joined by ``' . '``."""
        return u' . '.join(syll.to_string() for syll in self.sylls)
@attr.s
class Word(object):
    """A lexicon entry: spelling, POS tags, morph analyses, pronunciations.

    The serialized form is ``ortho(tag,tag,...)`` followed by one or more
    pronunciations, with ``#`` separating the segments.
    """

    ortho = attr.ib()
    pos = attr.ib(validator=instance_of(tuple))  # of Pos
    morphs = attr.ib(validator=instance_of(tuple))  # of Morph objects
    # TODO: validate that each element is instance of...
    prons = attr.ib(validator=instance_of(tuple))  # of Prons

    # Splits ``ortho(tags)`` into the spelling (group 1) and the raw,
    # comma-separated tag list (group 2); trailing whitespace is allowed.
    ortho_patt = re.compile(r'^([^\(]+?)\((.*)\)\s*$')

    @classmethod
    def from_string(cls, s, clean=False):
        """Parse one serialized entry into a ``Word``.

        Args:
            s: the entry text, ``ortho(tags) # pron # pron ...``.
            clean: when true, tags are passed through ``_clean_tag`` and
                pronunciations are parsed with ``clean=True``.

        Raises:
            ValueError: if there is no pronunciation segment, the first
                segment is not of the form ``ortho(tags)``, or a tag is not
                a known POS/entity name.
        """
        s = s.strip()
        raw_prons = s.split(u'#')
        # Drop empty trailing segments (e.g. from a trailing '#'). The
        # emptiness guard lets blank input reach the ValueError below rather
        # than failing with IndexError on an exhausted list.
        while raw_prons and not raw_prons[-1]:
            raw_prons.pop()
        if len(raw_prons) < 2:
            raise ValueError("string doesn't have enough segments: %s" % s)

        raw_ortho = raw_prons.pop(0)
        m = cls.ortho_patt.match(raw_ortho)
        if not m:
            raise ValueError("ortho doesn't match expected: %s" % raw_ortho)
        ortho, raw_pos = m.groups()

        all_morphs = []
        all_pos = []
        for t in raw_pos.split(','):
            if clean:
                t = _clean_tag(t)
            if not t:
                continue
            # A leading '+' marks a morphological analysis, not a POS tag.
            if t.startswith('+'):
                all_morphs.append(Morph.from_string(t))
            else:
                try:
                    all_pos.append(Pos.from_string(t))
                except KeyError as k:
                    raise ValueError("pos tag %s not found" % k)

        all_prons = [Pron.from_string(raw_pron, clean=clean)
                     for raw_pron in raw_prons]
        return cls(ortho=ortho, morphs=tuple(all_morphs), pos=tuple(all_pos),
                   prons=tuple(all_prons))

    def to_string(self):
        """Serialize back to ``ortho(tags) # pron # pron ...``.

        Morph analyses are emitted before POS tags, regardless of the order
        in which they appeared in the parsed input.
        """
        morph_pos = ','.join([m.to_string() for m in self.morphs + self.pos])
        key = u'%s(%s)' % (self.ortho, morph_pos)
        return u' # '.join([key] + [p.to_string() for p in self.prons])

    @property
    def ipa(self):
        """Flat tuple of phone values across all pronunciations, in order."""
        return tuple(itertools.chain.from_iterable(
            pron.ipa for pron in self.prons))