Source code for cltoolkit.models

"""
Basic models.
"""
import typing
import statistics
import collections

import attr
import lingpy
from clldutils.misc import lazyproperty as cached_property
import pyclts
from pyclts.models import Sound as CLTSSound, Symbol, Cluster, Consonant

from cltoolkit.util import NestedAttribute, DictTuple, jaccard, MutatedDataValue


[docs]@attr.s(repr=False) class CLCore: """ Base class to represent data in a wordlist. """ id = attr.ib() wordlist = attr.ib(default=None) data = attr.ib(default=None) def __repr__(self): return "<" + self.__class__.__name__ + " " + self.id + ">"
[docs]@attr.s class WithForms: """ Mixin to represent data in a wordlist that contains forms. """ forms = attr.ib(default=None) @cached_property def forms_with_sounds(self): return DictTuple([f for f in self.forms if f.sounds]) @cached_property def forms_with_graphemes(self): return DictTuple([f for f in self.forms if f.graphemes])
[docs]@attr.s class WithDataset: """ Mixin to represent data in a wordlist from a specific dataset. """ obj = attr.ib(default=None, repr=False) dataset = attr.ib(default=None, repr=False)
[docs]@attr.s(repr=False) class Language(CLCore, WithForms, WithDataset): """ Base class for handling languages. :ivar senses: `DictTuple` of senses, i.e. glosses for forms. :ivar concepts: `DictTuple` of senses with explicit Concepticon mapping. :ivar glottocode: `str`, Glottocode for the language. .. note:: A language variety is defined for a specific dataset only. """ senses = attr.ib(default=None) concepts = attr.ib(default=None) glottocode = MutatedDataValue("Glottocode") name = MutatedDataValue("Name") macroarea = MutatedDataValue("Macroarea") latitude = MutatedDataValue("Latitude") longitude = MutatedDataValue("Longitude") family = MutatedDataValue("Family") subgroup = MutatedDataValue("SubGroup") @cached_property def sound_inventory(self): sounds = [] for sound in self.wordlist.sounds: if self.id in sound.occurrences: sounds.append(Sound.from_sound(sound, language=self)) return Inventory(language=self, ts=self.wordlist.ts, sounds=DictTuple(sounds))
[docs]@attr.s(repr=False, eq=False) class Sense(CLCore, WithForms, WithDataset): """ A sense description (concept in source) which does not need to be linked to the Concepticon. :ivar language: :class:`Language` instance :ivar name: `str`, the gloss .. note:: Unlike senses in a wordlist, which are dataset-specific, concepts in a wordlist are defined for all datasets. """ language = attr.ib(default=None) name = MutatedDataValue("Name") def __repr__(self): return '<Sense ' + self.id + '>' def __eq__(self, other): if isinstance(other, self.__class__): return self.name == other.name return False @classmethod def from_sense(cls, sense, language, forms): return cls( id=sense.id, data=sense.data, obj=sense.obj, forms=forms, dataset=sense.dataset, wordlist=sense.wordlist, language=language)
[docs]@attr.s(repr=False, eq=False) class Concept(CLCore, WithForms): """ Base class for the concepts in a dataset. :ivar language: :class:`Language` instance :ivar name: `str`, the gloss :ivar senses: `iterable` of senses mapped to this concept :ivar concepticon_id: `str` ID of the Concepticon concept set the concept is mapped to. :ivar concepticon_gloss: `str` gloss of the Concepticon concept set the concept is mapped to. .. note:: Unlike senses in a wordlist, which are dataset-specific, concepts in a wordlist are defined for all datasets. As a result, they lack a reference to the original dataset in which they occur, but they have an attribute `senses` which is a reference to the original senses as they occur in different datasets. """ language = attr.ib(default=None) senses = attr.ib(default=None) name = attr.ib(default=None) concepticon_id = attr.ib(default=None) concepticon_gloss = attr.ib(default=None) @classmethod def from_sense(cls, concept, id=None, name=None, forms=None, senses=None): return cls( name=name, id=id, concepticon_id=concept.data.get("Concepticon_ID", ""), concepticon_gloss=concept.data.get("Concepticon_Gloss", ""), forms=forms, senses=senses ) @classmethod def from_concept(cls, concept, forms=None, senses=None): return cls( id=concept.id, name=concept.name, concepticon_id=concept.concepticon_id, concepticon_gloss=concept.concepticon_gloss, senses=senses, forms=forms, ) def __repr__(self): return "<Concept " + self.name + ">"
[docs]@attr.s(repr=False) class Form(CLCore, WithDataset): """ Base class for handling the form part of linguistic signs. :ivar concept: The concept (if any) expressed by the form. :ivar language: The language in which the form occurs. :ivar sense: The meaning expressed by the form. :ivar sounds: The segmented strings defined by the B(road) IPA. :ivar graphemes: The segmented graphemes (possibly not BIPA conform). """ concept = attr.ib(default=None, repr=False) language = attr.ib(default=None, repr=False) sense = attr.ib(default=None, repr=False) #: Sounds (graphemes recognized in the specified transcription system) in the segmented form: sounds = attr.ib(default=attr.Factory(list), repr=False) value = MutatedDataValue("Value") form = MutatedDataValue("Form") #: Graphemes in the segmented form: graphemes = MutatedDataValue("Segments", transform=lingpy.basictypes.lists) cognates = attr.ib(default=attr.Factory(dict), repr=False) @property def sound_objects(self): return [self.wordlist.sounds[str(self.wordlist.ts[t])] for t in self.sounds] @property def grapheme_objects(self): return [self.wordlist.graphemes[self.dataset + '-' + s] for s in self.graphemes or []] def __repr__(self): return "<" + self.__class__.__name__ + " " + self.form + ">"
[docs]@attr.s(repr=False) class Cognate(CLCore, WithDataset): form = attr.ib(default=None, repr=False) contribution = attr.ib(default=None, repr=False)
[docs]@attr.s(repr=False) class Grapheme(CLCore, WithDataset, WithForms): grapheme = attr.ib(default=None) occurrences = attr.ib(default=None) language = attr.ib(default=None) def __str__(self): return self.grapheme
[docs]@attr.s(repr=False, eq=False) class Sound(CLCore, WithForms): """ All sounds in a dataset. """ grapheme = attr.ib(default=None) occurrences = attr.ib(default=None) graphemes_in_source = attr.ib(default=None) language = attr.ib(default=None) obj = attr.ib(default=None) type = NestedAttribute("obj", "type") name = NestedAttribute("obj", "name") featureset = NestedAttribute("obj", "featureset") @classmethod def from_grapheme( cls, grapheme_, grapheme=None, occurrences=None, forms=None, id=None, graphemes_in_source=None, obj=None): return cls( id=id, grapheme=grapheme, wordlist=grapheme_.wordlist, occurrences=occurrences, data=obj.__dict__, graphemes_in_source=graphemes_in_source, forms=forms, obj=obj) def __len__(self): return len(self.occurrences or []) def __str__(self): return self.grapheme def __eq__(self, other): if isinstance(other, self.__class__): return self.grapheme == other.grapheme return False def __repr__(self): return "<" + self.__class__.__name__ + " " + self.grapheme + ">" def similarity(self, other): if self.type not in ["marker", "unknownsound"] and \ other.type not in ["marker", "unknownsound"]: return self.obj.similarity(other.obj) elif self.type in ["marker", "unknownsound"] and other.type in ["marker", "unknownsound"]: if self == other: return 1 return 0 return 0 @classmethod def from_sound(cls, sound, language): return cls( id=str(sound), language=language, data=sound.data, obj=sound.obj, wordlist=sound.wordlist, grapheme=sound.grapheme, occurrences=sound.occurrences[language.id], ) def consonant_or_cluster_attr(self, attribute): if isinstance(self.obj, Consonant): return getattr(self.obj, attribute) if isinstance(self.obj, Cluster): return getattr(self.obj.from_sound, attribute) raise AttributeError(attribute) @property def manner(self): return self.consonant_or_cluster_attr('manner') @property def place(self): return self.consonant_or_cluster_attr('place') @property def ejection(self): return self.consonant_or_cluster_attr('ejection') @property def airstream(self): return self.consonant_or_cluster_attr('airstream')
class GetSubInventoryByType: def __init__(self, types): def select_sounds(inventory): return DictTuple([v for v in inventory if v.type in types]) self.select_sounds = select_sounds def __get__(self, obj, objtype=None): return self.select_sounds(obj.sounds) class GetSubInventoryByProperty(GetSubInventoryByType): def __init__(self, types, properties): GetSubInventoryByType.__init__(self, types) self.properties = properties def __get__(self, obj, objtype=None): out = [] sounds = self.select_sounds(obj.sounds) sound_set = set([sound.grapheme for sound in sounds]) for v in sounds: stripped = obj.ts.features.get( frozenset([s for s in v.featureset if s not in self.properties]) ) if str(stripped) != str(v) and str(stripped) not in sound_set: out += [v] elif str(stripped) == str(v): out += [v] return DictTuple(out) @attr.s class Inventory: language = attr.ib(default=None) sounds = attr.ib(default=None, repr=False) ts = attr.ib(default=None, repr=False) consonants = GetSubInventoryByType(["consonant"]) consonants_by_quality = GetSubInventoryByProperty( ["consonant"], ["long", "ultra-long", "mid-long", "ultra-short"] ) consonant_sounds = GetSubInventoryByType(["consonant", "cluster"]) vowels = GetSubInventoryByType(["vowel"]) vowels_by_quality = GetSubInventoryByProperty( ["vowel"], ["long", "ultra-long", "mid-long", "ultra-short"] ) vowel_sounds = GetSubInventoryByType(["vowel", "diphthong"]) segments = GetSubInventoryByType(["consonant", "vowel", "cluster", "diphthong"]) tones = GetSubInventoryByType(["tone"]) markers = GetSubInventoryByType(["marker"]) clusters = GetSubInventoryByType(["cluster"]) diphthongs = GetSubInventoryByType(["diphthong"]) unknownsounds = GetSubInventoryByType(["unknownsound"]) @classmethod def from_list( cls, ts: pyclts.TranscriptionSystem, *list_of_sounds: typing.Union[CLTSSound, Symbol, str], language=None, wordlist=None, ): sounds = collections.OrderedDict() for itm in list_of_sounds: sound = ts[itm] try: sounds[str(sound)].graphemes_in_source.append(itm) except KeyError: sounds[str(sound)] = Sound( id=str(sound), obj=sound, wordlist=wordlist, grapheme=str(sound), graphemes_in_source=[sound.grapheme], occurrences=[], data=sound.__dict__ ) return cls(sounds=DictTuple(sounds.values()), ts=ts, language=language) def __len__(self): return len(self.sounds) def __iter__(self): return iter(self.sounds) def __getitem__(self, idx): return self.sounds[idx] def strict_similarity(self, other, aspects=None): aspects = aspects or ["sounds"] scores = [] for aspect in aspects: soundsA, soundsB = ( {sound.grapheme for sound in getattr(self, aspect)}, {sound.grapheme for sound in getattr(other, aspect)}, ) if soundsA or soundsB: scores += [jaccard(soundsA, soundsB)] if not scores: return 0 return statistics.mean(scores) def approximate_similarity(self, other, aspects=None): aspects = aspects or ["sounds"] def approximate(soundsA, soundsB): matches = [] for soundA in soundsA: best_match, best_sim = None, 0 for soundB in soundsB: if soundA.type != "unknownsound" and soundB.type != "unknownsound": current_sim = soundA.similarity(soundB) else: current_sim = 0 if current_sim > best_sim: best_match = soundB best_sim = current_sim if best_match is not None: matches += [best_sim] soundsB = [s for s in soundsB if s != best_match] matches += [0 for s in soundsB] return statistics.mean(matches) scores = [] for aspect in aspects: soundsA, soundsB = ( getattr(self, aspect), getattr(other, aspect), ) if soundsA and soundsB: scores += [ statistics.mean( [approximate(soundsA, soundsB), approximate(soundsB, soundsA)] ) ] elif soundsA or soundsB: scores += [0] if not scores or not sum(scores): return 0 return statistics.mean(scores)