# Source code for cltoolkit.util

"""
Utility functions for lexicore.
"""
import pathlib
import functools

from lingpy.sequence.sound_classes import syllabify
from lingpy.basictypes import lists
from pycldf import Dataset
from pycldf.util import DictTuple as BaseDictTuple

# Public names exported by a star-import of this module.
# NOTE(review): `datasets_by_id` and `lingpy_columns` are defined further down
# in this module but are not listed here -- confirm whether that is intentional.
__all__ = [
    'valid_sounds', 'identity', 'jaccard', 'iter_syllables',
    'DictTuple', 'NestedAttribute', 'MutatedDataValue', 'MutatedNestedDictValue']


def valid_sounds(sounds):
    """
    Make sure tokens conform to transcription system.

    Leading and trailing separators (``+`` and ``_``) are dropped, a run of
    consecutive separators is reduced to its first member, and ``_`` is
    rewritten as ``+``.

    :param sounds: List of Sound objects (pyclts.Sound).
    :return: The string forms of the remaining tokens wrapped in lingpy's \
    `lists`, or a plain empty list if `sounds` is empty, consists of separators \
    only, or contains a token whose `type` is ``'unknownsound'``.
    """
    if not sounds:
        return []
    separators = ("+", "_")
    tokens = list(sounds)

    # Strip separators from both ends by moving indices; the bounds checks keep
    # an all-separator input from running off the end of the list.
    start, end = 0, len(tokens)
    while start < end and str(tokens[start]) in separators:
        start += 1
    while end > start and str(tokens[end - 1]) in separators:
        end -= 1
    tokens = tokens[start:end]
    if not tokens:
        return []

    out = []
    previous_was_separator = False
    for token in tokens:
        text = str(token)
        is_separator = text in separators
        if is_separator and previous_was_separator:
            # Only the first separator of a run is kept.
            pass
        elif text == "_":
            out.append("+")
        elif token.type == 'unknownsound':
            # A single unknown sound invalidates the whole sequence.
            return []
        else:
            out.append(text)
        previous_was_separator = is_separator
    return lists(out)


def identity(x):
    """
    Return the argument unchanged.

    Meant as a no-op default wherever a function argument is optional.
    """
    return x


def jaccard(a, b):
    """
    Returns the Jaccard similarity (Jaccard index) of two sets.

    The value is the size of the intersection divided by the size of the union,
    so identical non-empty sets score 1 and disjoint sets score 0. Note that
    this is the similarity, not the distance (which would be ``1 - similarity``).

    :param a: A `set` (or any object providing `intersection` and `union`).
    :param b: The collection to compare `a` with.
    :return: Intersection size over union size, or 0 if the union is empty.
    """
    shared = len(a.intersection(b))
    total = len(a.union(b))
    if not total:
        # Both inputs are empty; avoid dividing by zero.
        return 0
    return shared / total


def iter_syllables(form):
    """
    Yield the syllables of a form, one morpheme at a time.

    Every item of ``form.sounds.n`` is passed to lingpy's `syllabify` with
    ``output='nested'`` and the resulting syllables are yielded in order.

    :param form: Object exposing ``sounds.n`` (presumably the form's segments \
    split into morphemes -- confirm against the callers).
    """
    for segments in form.sounds.n:
        yield from syllabify(segments, output='nested')


class NestedAttribute:
    """
    A descriptor implementing a nested attribute getter.

    Reading the descriptor on an instance returns attribute `inner_attribute` of
    the instance's attribute `outer_attribute`, or `None` if the inner attribute
    does not exist. Reading it on the owning class returns the descriptor itself.

    Used to implement Facade-pattern-style access to complex attribute data.

    .. code-block:: python

        >>> class C:
        ...     a = 'ABC'
        ...     b = NestedAttribute('a', 'lower')
        ...
        >>> C().b()
        'abc'

    .. seealso:: https://en.wikipedia.org/wiki/Facade_pattern
    """
    def __init__(self, outer_attribute, inner_attribute):
        """
        :param outer_attribute: Name of the attribute to look up on the instance.
        :param inner_attribute: Name of the attribute to look up on that object.
        """
        self._outer = outer_attribute
        self._inner = inner_attribute

    def __get__(self, obj, objtype=None):
        if obj is None:
            # Accessed on the class rather than an instance: there is nothing to
            # delegate to, so hand back the descriptor instead of failing on None.
            return self
        outer = getattr(obj, self._outer)
        return getattr(outer, self._inner, None)


class MutatedNestedDictValue:
    """
    Descriptor to retrieve a mutated value of a nested `dict`.

    Reading the descriptor on an instance looks up `key` in the `dict` stored
    under the instance's attribute `attribute` and returns the result of passing
    the value to `transform`. If the key is missing, `transform` is called with
    `None`. Reading the descriptor on the owning class returns the descriptor
    itself.

    Used to implement Facade-pattern-style access to complex attribute data.

    .. code-block:: python

        >>> class C:
        ...     a = {'x': 5}
        ...     b = MutatedNestedDictValue('a', 'x', transform=lambda x: x + 5)
        ...
        >>> C().b
        10

    .. seealso:: https://en.wikipedia.org/wiki/Facade_pattern
    """
    def __init__(self, attribute, key, transform=identity):
        """
        :param attribute: Name of the instance attribute holding the `dict`.
        :param key: Key to look up in that `dict`.
        :param transform: Callable applied to the looked-up value (or to `None`).
        """
        self.transform = transform
        self.attr, self.key = attribute, key

    def __get__(self, obj, objtype=None):
        if obj is None:
            # Accessed on the class rather than an instance: there is no dict to
            # read from, so hand back the descriptor instead of failing on None.
            return self
        mapping = getattr(obj, self.attr)
        return self.transform(mapping.get(self.key))


# Shortcut for a `MutatedNestedDictValue` that always reads from the `data`
# attribute; callers pass only the key and, optionally, `transform`.
MutatedDataValue = functools.partial(MutatedNestedDictValue, 'data')


class DictTuple(BaseDictTuple):
    """
    An object allowing access to items of a `tuple` as if it were a `dict` keyed with the `id`
    attribute of the contained objects.

    Integer and slice subscripts are passed straight to the base class; any other
    subscript is treated as a key and raises `KeyError` when it is unknown.
    """
    def get(self, item, default=None):
        """
        Return ``self[item]``, or `default` if that lookup raises `KeyError`.
        """
        try:
            return self[item]
        except KeyError:
            return default

    def __getitem__(self, item):
        # Keys are checked against the index explicitly so that an unknown key
        # always surfaces as KeyError before the base class is consulted.
        if not isinstance(item, (int, slice)) and item not in self._d:
            raise KeyError(item)
        return super().__getitem__(item)

    def __contains__(self, item):
        # Accept either a key or an object carrying the key in its `id` attribute.
        key = getattr(item, 'id', item)
        return key in self._d

    def items(self):
        """
        Yield ``(key, item)`` pairs, `dict`-style.

        Only the first entry registered for each key is used (presumably the
        index values are lists of tuple positions -- see the base class).
        """
        for key, positions in self._d.items():
            yield key, self[positions[0]]


def datasets_by_id(*ids, path='*/*/cldf/cldf-metadata.json', base_dir="."):
    """
    Return `pycldf` dataset instances by searching for their identifiers.

    :param ids: Identifiers to look for; a metadata file is selected if any of \
    them occurs as a substring of its path.
    :param path: Glob pattern, relative to `base_dir`, locating metadata files.
    :param base_dir: Directory in which the glob pattern is evaluated.
    :return: `list` of datasets loaded via `Dataset.from_metadata`, in glob order.
    """
    candidates = pathlib.Path(base_dir).glob(path)
    return [
        Dataset.from_metadata(metadata_path)
        for metadata_path in candidates
        if any(did in str(metadata_path) for did in ids)
    ]


def lingpy_columns(**kw):
    """
    Define basic columns for export to LingPy wordlists.

    Each column is a pair ``((object, attribute), header)``.

    :param kw: If the keyword `cognates` is given, a ``cognacy`` column reading \
    the attribute named by its value from ``cognates`` is appended. Other \
    keywords are ignored.
    :return: `list` of column specifications.
    """
    columns = [
        ((source, attribute), header)
        for source, attribute, header in (
            ("form", "id", "local_id"),
            ("language", "id", "doculect"),
            ("concept", "id", "concept"),
            ("sense", "name", "concept_in_source"),
            ("form", "value", "value"),
            ("form", "form", "form"),
            ("form", "sounds", "tokens"),
        )
    ]
    if "cognates" in kw:
        columns.append((("cognates", kw["cognates"]), "cognacy"))
    return columns