Source code for islex.load

# -*- coding: utf-8 -*-

from __future__ import print_function

import collections
import os.path

from six import text_type as unicode

from islex.tokens import Word, PosCategory

ISLE_FILE = '/opt/data/ISLEdict.txt'

# TODO(jkahn): this is obviously only going to work for me.
CHECKOUT_ROOT = '/home/jeremy/src'


def _open_data_package_target(stem):
    islex_path = 'islex-%s' % stem
    package_dir = os.path.join(CHECKOUT_ROOT, islex_path, islex_path)
    if not os.path.exists(package_dir):
        os.makedirs(package_dir)
    f = os.path.join(package_dir, 'entries.txt')
    return open(f, mode='w')


[docs]def write_package_data(): core = _open_data_package_target('core') entities = _open_data_package_target('entities') periphery = _open_data_package_target('periphery') def is_unambiguous_entity(w): ENTITY_CATEGORIES = (PosCategory.ABBREVIATION, PosCategory.NNP, PosCategory.NNPS) return all(pos.category in ENTITY_CATEGORIES for pos in w.pos) for w in stream_from_fh(open(ISLE_FILE, mode='r'), clean=True): if not len(w.pos) and not len(w.morphs): out = periphery elif is_unambiguous_entity(w): out = entities else: # Some entities with other tags will end up in core. out = core out.write(w.to_string().encode('utf-8') + "\n")
[docs]def stream_from_fh(fh, clean=False): for l in fh: l = l.decode('utf-8') try: yield Word.from_string(unicode(l), clean=clean) except ValueError as v: print(unicode(v).encode('utf-8')) continue
[docs]class ReadOnlyMapping(collections.Mapping): def __init__(self, backing_store): assert isinstance(backing_store, collections.Mapping) self._store = backing_store def __getitem__(self, key): return self._store[key] def __iter__(self): return iter(self._store) def __len__(self): return len(self._store)
[docs]class CaseInsensitiveMapping(ReadOnlyMapping): # Note: assumes that all keys in backing store are already lowercased def __getitem__(self, key): return self._store[key.lower()]
MEMOIZED_MAPPINGS = {}
[docs]def ortho_mapping(module): if module not in MEMOIZED_MAPPINGS: d = dict() for w in module.entries_stream(): orth = w.ortho.lower() if orth not in d: d[orth] = [] d[orth].append(w) MEMOIZED_MAPPINGS[module] = CaseInsensitiveMapping(backing_store=d) return MEMOIZED_MAPPINGS[module]