"""
Basic models.
"""
import typing
import statistics
import collections
import attr
import lingpy
from clldutils.misc import lazyproperty as cached_property
import pyclts
from pyclts.models import Sound as CLTSSound, Symbol, Cluster, Consonant
from cltoolkit.util import NestedAttribute, DictTuple, jaccard, MutatedDataValue
@attr.s(repr=False)
class CLCore:
    """
    Common base for the objects stored in a wordlist.

    :ivar id: identifier of the object. It is concatenated into the `repr`, so it is \
expected to be a `str`.
    :ivar wordlist: reference to a wordlist, `None` by default.
    :ivar data: data attached to the object, `None` by default.
    """
    id = attr.ib()
    wordlist = attr.ib(default=None)
    data = attr.ib(default=None)

    def __repr__(self):
        # The name of the concrete class is used, so subclasses inherit a fitting repr.
        class_name = type(self).__name__
        return "<" + class_name + " " + self.id + ">"
@attr.s
class WithDataset:
    """
    Mixin for wordlist data that stems from one specific dataset.

    Both attributes default to `None` and are left out of the generated `repr`.

    :ivar obj: the underlying object.
    :ivar dataset: the dataset.
    """
    obj = attr.ib(repr=False, default=None)
    dataset = attr.ib(repr=False, default=None)
@attr.s(repr=False)
class Language(CLCore, WithForms, WithDataset):
    """
    Base class for handling languages.

    :ivar senses: `DictTuple` of senses, i.e. glosses for forms.
    :ivar concepts: `DictTuple` of senses with explicit Concepticon mapping.
    :ivar glottocode: `str`, Glottocode for the language.

    .. note::

       A language variety is defined for a specific dataset only.
    """
    senses = attr.ib(default=None)
    concepts = attr.ib(default=None)
    glottocode = MutatedDataValue("Glottocode")
    name = MutatedDataValue("Name")
    macroarea = MutatedDataValue("Macroarea")
    latitude = MutatedDataValue("Latitude")
    longitude = MutatedDataValue("Longitude")
    family = MutatedDataValue("Family")
    subgroup = MutatedDataValue("SubGroup")

    @cached_property
    def sound_inventory(self):
        """
        :class:`Inventory` of the wordlist's sounds that occur in this language.

        A sound is included when this language's `id` is found in the sound's `occurrences`.
        """
        wordlist = self.wordlist
        language_sounds = [
            Sound.from_sound(sound, language=self)
            for sound in wordlist.sounds
            if self.id in sound.occurrences
        ]
        return Inventory(
            sounds=DictTuple(language_sounds),
            ts=wordlist.ts,
            language=self,
        )
@attr.s(repr=False, eq=False)
class Sense(CLCore, WithForms, WithDataset):
    """
    A sense description (concept in source), not necessarily linked to the Concepticon.

    :ivar language: :class:`Language` instance
    :ivar name: `str`, the gloss

    .. note::

       Unlike senses in a wordlist, which are dataset-specific, concepts in a wordlist are
       defined for all datasets.
    """
    language = attr.ib(default=None)
    name = MutatedDataValue("Name")

    def __repr__(self):
        return "<Sense " + self.id + ">"

    def __eq__(self, other):
        # Senses are compared by gloss only; instances of unrelated classes are never equal.
        return isinstance(other, self.__class__) and self.name == other.name

    @classmethod
    def from_sense(cls, sense, language, forms):
        """
        Create a copy of `sense` that is bound to `language` and carries `forms`.

        `id`, `data`, `obj`, `dataset` and `wordlist` are taken over from `sense`.
        """
        inherited = dict(
            id=sense.id,
            data=sense.data,
            obj=sense.obj,
            dataset=sense.dataset,
            wordlist=sense.wordlist,
        )
        return cls(forms=forms, language=language, **inherited)
@attr.s(repr=False, eq=False)
class Concept(CLCore, WithForms):
    """
    Base class for the concepts in a dataset.

    :ivar language: :class:`Language` instance
    :ivar name: `str`, the gloss
    :ivar senses: `iterable` of senses mapped to this concept
    :ivar concepticon_id: `str` ID of the Concepticon concept set the concept is mapped to.
    :ivar concepticon_gloss: `str` gloss of the Concepticon concept set the concept is mapped to.

    .. note::

       Unlike senses in a wordlist, which are dataset-specific, concepts in a
       wordlist are defined for all datasets. As a result, they lack a
       reference to the original dataset in which they occur, but they have an
       attribute `senses` which is a reference to the original senses as they
       occur in different datasets.
    """
    language = attr.ib(default=None)
    senses = attr.ib(default=None)
    name = attr.ib(default=None)
    concepticon_id = attr.ib(default=None)
    concepticon_gloss = attr.ib(default=None)

    @classmethod
    def from_sense(cls, concept, id=None, name=None, forms=None, senses=None):
        """
        Create a concept from an object whose `data` mapping holds the Concepticon reference.

        :param concept: object with a `data` mapping; `"Concepticon_ID"` and \
`"Concepticon_Gloss"` are read from it and default to `""` when missing.
        :param id: identifier of the new concept.
        :param name: gloss of the new concept.
        :param forms: forms to attach to the new concept.
        :param senses: senses to attach to the new concept.
        """
        return cls(
            name=name,
            id=id,
            concepticon_id=concept.data.get("Concepticon_ID", ""),
            concepticon_gloss=concept.data.get("Concepticon_Gloss", ""),
            forms=forms,
            senses=senses,
        )

    @classmethod
    def from_concept(cls, concept, forms=None, senses=None):
        """
        Create a copy of `concept` with its own `forms` and `senses`.

        `id`, `name`, `concepticon_id` and `concepticon_gloss` are taken over from `concept`.
        """
        return cls(
            id=concept.id,
            name=concept.name,
            concepticon_id=concept.concepticon_id,
            concepticon_gloss=concept.concepticon_gloss,
            senses=senses,
            forms=forms,
        )

    def __repr__(self):
        # `name` defaults to None; formatting instead of concatenating keeps the repr from
        # raising a TypeError for concepts without a name.
        return "<Concept {}>".format(self.name)
@attr.s(repr=False)
class Cognate(CLCore, WithDataset):
    """
    Cognate entry of a wordlist.

    :ivar form: the form of the cognate entry, `None` by default.
    :ivar contribution: the contribution of the cognate entry, `None` by default.
    """
    form = attr.ib(repr=False, default=None)
    contribution = attr.ib(repr=False, default=None)
@attr.s(repr=False)
class Grapheme(CLCore, WithDataset, WithForms):
    """
    A grapheme of a wordlist.

    :ivar grapheme: the grapheme; this is also the string representation of the object.
    :ivar occurrences: occurrences of the grapheme, `None` by default.
    :ivar language: language of the grapheme, `None` by default.
    """
    grapheme = attr.ib(default=None)
    occurrences = attr.ib(default=None)
    language = attr.ib(default=None)

    def __str__(self):
        # The object is rendered as the bare grapheme.
        return self.grapheme
@attr.s(repr=False, eq=False)
class Sound(CLCore, WithForms):
    """
    All sounds in a dataset.

    Two sounds are equal when their graphemes are equal; `len()` of a sound is the number
    of its occurrences.

    :ivar grapheme: the grapheme of the sound, also used as its string representation.
    :ivar occurrences: occurrences of the sound. `from_sound` indexes it by language id, so \
on the source sound it is presumably a mapping from language id to occurrences.
    :ivar graphemes_in_source: graphemes of the sound as found in the source.
    :ivar language: language the sound is bound to, `None` by default.
    :ivar obj: underlying sound object; `type`, `name` and `featureset` are read from it.
    """
    grapheme = attr.ib(default=None)
    occurrences = attr.ib(default=None)
    graphemes_in_source = attr.ib(default=None)
    language = attr.ib(default=None)
    obj = attr.ib(default=None)
    type = NestedAttribute("obj", "type")
    name = NestedAttribute("obj", "name")
    featureset = NestedAttribute("obj", "featureset")

    @classmethod
    def from_grapheme(
            cls, grapheme_, grapheme=None, occurrences=None, forms=None,
            id=None, graphemes_in_source=None, obj=None):
        """
        Create a sound for the wordlist that `grapheme_` belongs to.

        :param grapheme_: object whose `wordlist` is taken over.
        :param obj: underlying sound object. Its `__dict__` becomes the `data` of the new \
sound; `data` is `None` when no `obj` is given.
        """
        return cls(
            id=id,
            grapheme=grapheme,
            wordlist=grapheme_.wordlist,
            occurrences=occurrences,
            # `obj` is optional in the signature, so its default must not crash here.
            data=obj.__dict__ if obj is not None else None,
            graphemes_in_source=graphemes_in_source,
            forms=forms,
            obj=obj)

    def __len__(self):
        return len(self.occurrences or [])

    def __str__(self):
        return self.grapheme

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.grapheme == other.grapheme
        return False

    def __repr__(self):
        # `grapheme` defaults to None; formatting instead of concatenating keeps the repr
        # from raising a TypeError for such sounds.
        return "<{} {}>".format(self.__class__.__name__, self.grapheme)

    def similarity(self, other):
        """
        Compute the similarity of this sound to `other`.

        - If neither sound is a marker or an unknown sound, the similarity of the underlying
          sound objects is returned.
        - If both are markers or unknown sounds, the result is 1 for equal sounds, else 0.
        - Mixed pairs always score 0.
        """
        opaque_types = ("marker", "unknownsound")
        self_is_opaque = self.type in opaque_types
        other_is_opaque = other.type in opaque_types
        if not self_is_opaque and not other_is_opaque:
            return self.obj.similarity(other.obj)
        if self_is_opaque and other_is_opaque:
            return 1 if self == other else 0
        return 0

    @classmethod
    def from_sound(cls, sound, language):
        """
        Create a language-specific copy of `sound`.

        Only the occurrences stored under `language.id` are kept.
        """
        return cls(
            id=str(sound),
            language=language,
            data=sound.data,
            obj=sound.obj,
            wordlist=sound.wordlist,
            grapheme=sound.grapheme,
            occurrences=sound.occurrences[language.id],
        )

    def consonant_or_cluster_attr(self, attribute):
        """
        Look up `attribute` on a consonant, or on the `from_sound` of a cluster.

        :raises AttributeError: if the underlying object is neither a consonant nor a cluster.
        """
        if isinstance(self.obj, Consonant):
            return getattr(self.obj, attribute)
        if isinstance(self.obj, Cluster):
            return getattr(self.obj.from_sound, attribute)
        raise AttributeError(attribute)

    @property
    def manner(self):
        return self.consonant_or_cluster_attr('manner')

    @property
    def place(self):
        return self.consonant_or_cluster_attr('place')

    @property
    def ejection(self):
        return self.consonant_or_cluster_attr('ejection')

    @property
    def airstream(self):
        return self.consonant_or_cluster_attr('airstream')
class GetSubInventoryByType:
    """
    Descriptor selecting the sounds of an inventory by sound type.

    Read from an instance, it returns a `DictTuple` with those items of the instance's
    `sounds` whose `type` is contained in `types`. Read from the owning class, it returns
    the descriptor itself.

    :param types: container of accepted sound types.
    """

    def __init__(self, types):
        def select_sounds(inventory):
            return DictTuple([v for v in inventory if v.type in types])

        # Stored as an attribute so that subclasses can reuse the type filter.
        self.select_sounds = select_sounds

    def __get__(self, obj, objtype=None):
        if obj is None:
            # Class-level access (e.g. by introspection tools): there is no inventory to
            # filter, so hand out the descriptor as the descriptor protocol suggests.
            return self
        return self.select_sounds(obj.sounds)
class GetSubInventoryByProperty(GetSubInventoryByType):
    """
    Descriptor selecting sounds by type while collapsing distinctions made by `properties`.

    A sound of a matching type is kept when either

    - removing `properties` from its feature set yields the sound itself, or
    - the sound obtained by removing `properties` is not part of the selection.

    Read from the owning class, the descriptor returns itself.

    :param types: container of accepted sound types.
    :param properties: container of features to strip from the feature set of each sound.
    """

    def __init__(self, types, properties):
        GetSubInventoryByType.__init__(self, types)
        self.properties = properties

    def __get__(self, obj, objtype=None):
        if obj is None:
            # Class-level access: nothing to filter, return the descriptor itself.
            return self
        sounds = self.select_sounds(obj.sounds)
        graphemes = {sound.grapheme for sound in sounds}
        out = []
        for sound in sounds:
            # Look up the sound that has the same features minus the stripped properties.
            # A failed lookup gives None, whose string form is not expected among the
            # graphemes, so the sound is kept in that case.
            stripped = str(obj.ts.features.get(
                frozenset(f for f in sound.featureset if f not in self.properties)
            ))
            if stripped == str(sound) or stripped not in graphemes:
                out.append(sound)
        return DictTuple(out)
@attr.s
class Inventory:
    """
    A sound inventory, i.e. a collection of sounds, optionally tied to a language.

    The inventory supports `len()`, iteration and item access, all delegated to `sounds`.
    The sub-inventories below are computed from `sounds` on each access.

    :ivar language: language of the inventory, `None` by default.
    :ivar sounds: the sounds of the inventory.
    :ivar ts: transcription system; the `*_by_quality` sub-inventories use its `features`.
    """
    language = attr.ib(default=None)
    sounds = attr.ib(default=None, repr=False)
    ts = attr.ib(default=None, repr=False)
    consonants = GetSubInventoryByType(["consonant"])
    consonants_by_quality = GetSubInventoryByProperty(
        ["consonant"], ["long", "ultra-long", "mid-long", "ultra-short"]
    )
    consonant_sounds = GetSubInventoryByType(["consonant", "cluster"])
    vowels = GetSubInventoryByType(["vowel"])
    vowels_by_quality = GetSubInventoryByProperty(
        ["vowel"], ["long", "ultra-long", "mid-long", "ultra-short"]
    )
    vowel_sounds = GetSubInventoryByType(["vowel", "diphthong"])
    segments = GetSubInventoryByType(["consonant", "vowel", "cluster", "diphthong"])
    tones = GetSubInventoryByType(["tone"])
    markers = GetSubInventoryByType(["marker"])
    clusters = GetSubInventoryByType(["cluster"])
    diphthongs = GetSubInventoryByType(["diphthong"])
    unknownsounds = GetSubInventoryByType(["unknownsound"])

    @classmethod
    def from_list(
        cls,
        ts: pyclts.TranscriptionSystem,
        *list_of_sounds: typing.Union[CLTSSound, Symbol, str],
        language=None,
        wordlist=None,
    ):
        """
        Build an inventory from individual sounds.

        Every item is looked up in `ts`. Items resolving to the same sound (same string
        form) are merged into one :class:`Sound`, whose `graphemes_in_source` collects the
        contributing items. The order of first appearance is kept.
        """
        merged = collections.OrderedDict()
        for item in list_of_sounds:
            sound = ts[item]
            key = str(sound)
            if key in merged:
                # Known sound: only remember the additional item.
                merged[key].graphemes_in_source.append(item)
                continue
            merged[key] = Sound(
                id=key,
                obj=sound,
                wordlist=wordlist,
                grapheme=key,
                graphemes_in_source=[sound.grapheme],
                occurrences=[],
                data=sound.__dict__,
            )
        return cls(language=language, ts=ts, sounds=DictTuple(merged.values()))

    def __len__(self):
        return len(self.sounds)

    def __iter__(self):
        return iter(self.sounds)

    def __getitem__(self, idx):
        return self.sounds[idx]

    def strict_similarity(self, other, aspects=None):
        """
        Compare two inventories by the overlap of their graphemes.

        For each aspect (attribute name of the inventory, `["sounds"]` by default) the
        `jaccard` score of the two grapheme sets is computed; aspects that are empty in both
        inventories are skipped. The mean of the scores is returned, or 0 if there are none.
        """
        scores = []
        for aspect in (aspects or ["sounds"]):
            own = {sound.grapheme for sound in getattr(self, aspect)}
            theirs = {sound.grapheme for sound in getattr(other, aspect)}
            if not (own or theirs):
                continue
            scores.append(jaccard(own, theirs))
        return statistics.mean(scores) if scores else 0

    def approximate_similarity(self, other, aspects=None):
        """
        Compare two inventories by the similarity of their best matching sounds.

        For each aspect (attribute name of the inventory, `["sounds"]` by default) the
        directed score is computed in both directions and averaged. An aspect that is empty
        in exactly one inventory scores 0; an aspect that is empty in both is skipped.
        The mean over the aspects is returned, or 0 if there are no scores or they sum to 0.
        """
        def approximate(sources, targets):
            # Greedy matching: every source sound takes the most similar remaining target
            # sound. Matched targets are removed, unmatched targets count as 0.
            matches = []
            for source in sources:
                best_match, best_sim = None, 0
                for target in targets:
                    both_known = (
                        source.type != "unknownsound" and target.type != "unknownsound")
                    sim = source.similarity(target) if both_known else 0
                    if sim > best_sim:
                        best_match, best_sim = target, sim
                if best_match is None:
                    continue
                matches.append(best_sim)
                targets = [t for t in targets if t != best_match]
            matches.extend(0 for _ in targets)
            return statistics.mean(matches)

        scores = []
        for aspect in (aspects or ["sounds"]):
            own, theirs = getattr(self, aspect), getattr(other, aspect)
            if own and theirs:
                both_directions = [approximate(own, theirs), approximate(theirs, own)]
                scores.append(statistics.mean(both_directions))
            elif own or theirs:
                scores.append(0)
        if scores and sum(scores):
            return statistics.mean(scores)
        return 0