Skip to content

Commit a68d170

Browse files
author
Your Name
committed
bug fix + test
1 parent a279acb commit a68d170

File tree

10 files changed

+213
-131
lines changed

10 files changed

+213
-131
lines changed

README.md

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ gîrodey xałî řeşte; gwêt le neẍmey tuyûre?
4444

4545
Arabic script into the Latin script suggested by Dr. Feryad Fazil Omar:
4646
```python
47-
>>> print(asosoft.Ar2LaF("گیرۆدەی خاڵی ڕەشتە؛ گوێت لە نەغمەی تویوورە؟"))
47+
>>> print(asosoft.Ar2LaFeryad("گیرۆدەی خاڵی ڕەشتە؛ گوێت لە نەغمەی تویوورە؟"))
4848
gîrodey xaḻî ṟeşte; gwêt le nex̱mey tuyûre?
4949
```
5050

@@ -144,7 +144,7 @@ Trim starting and ending white spaces (including zero width spaces) of line,
144144
### Replace Html Entities
145145
`ReplaceHtmlEntity` replaces HTML entities with single Unicode characters (e.g. "&eacute;" with "é"). It is useful in web-crawled corpora.
146146
```python
147-
>>> print(asosoft.ReplaceHtmlEntity("ئێوە &quot;دەق&quot; لە زمانی &lt;کوردی&gt; دەنووسن"))
147+
>>> print(asosoft.ReplaceHtmlEntity("ئێوە &quot;دەق&quot; بە زمانی &lt;کوردی&gt; دەنووسن"))
148148
ئێوە "دەق" بە زمانی <کوردی> دەنووسن
149149
```
150150
### Replace URLs and emails
@@ -167,8 +167,7 @@ Trim starting and ending white spaces (including zero width spaces) of line,
167167
### Word to Word Replacement
168168
`Word2WordReplacement` applies a "string to string" replacement dictionary on the text. It replaces only fully-matched words, not parts of them.
169169
```python
170-
>>> dict = {"مال": "ماڵ", "سلاو": "سڵاو"}
171-
>>> print(asosoft.Word2WordReplacement("مال، نووری مالیکی", dict))
170+
>>> print(asosoft.Word2WordReplacement("مال، نووری مالیکی", {"مال": "ماڵ", "سلاو": "سڵاو"}))
172171
ماڵ، نووری مالیکی
173172
```
174173

@@ -193,13 +192,14 @@ Sorting a string list in correct order of Kurdish alphabet ("ئءاآأإبپت
193192
```python
194193
>>> myList = ["یەک", "ڕەنگ", "ئەو", "ئاو", "ڤەژین", "فڵان"]
195194
>>> print(asosoft.KurdishSort(myList))
196-
"ئاو", "ئەو", "ڕەنگ", "فڵان", "ڤەژین", "یەک"
195+
["ئاو", "ئەو", "ڕەنگ", "فڵان", "ڤەژین", "یەک"]
197196
```
198197
or using your custom order:
199198
```python
200199
>>> inputList = ["یەک", "ڕەنگ", "ئەو", "ئاو", "ڤەژین", "فڵان"]
201-
>>> inputOrder = list(["ئءاآأإبپتثجچحخدڎڊذرڕزژسشصضطظعغفڤقكکگڴلڵمنوۆۊۉۋهھەیێ"])
200+
>>> inputOrder = list("ئءاآأإبپتثجچحخدڎڊذرڕزژسشصضطظعغفڤقكکگڴلڵمنوۆۊۉۋهھەیێ")
202201
>>> print(asosoft.CustomSort(inputList, inputOrder))
202+
["ئاو", "ئەو", "ڕەنگ", "فڵان", "ڤەژین", "یەک"]
203203
```
204204
## Poem Meter Classifier
205205
It classifies the meter of the input Kurdish poem typed in Arabic script. The lines of the poem should be separated by the newline character ('\n').
@@ -208,5 +208,7 @@ You can find Kurdish poems in https://books.vejin.net/.
208208
>>> poem = "گەرچی تووشی ڕەنجەڕۆیی و حەسرەت و دەردم ئەمن\nقەت لەدەس ئەم چەرخە سپڵە نابەزم مەردم ئەمن\nئاشقی چاوی کەژاڵ و گەردنی پڕ \nخاڵ نیم\nئاشقی کێو و تەلان و بەندەن و بەردم ئەمن"
209209
>>> classified = asosoft.ClassifyKurdishPoem(poem)
210210
>>> print("Poem Type= " + classified.overalMeterType)
211+
Quantitative/عەرووزی
211212
>>> print("Poem Meter= " + classified.overalPattern)
213+
فاعلاتن فاعلاتن فاعلاتن فاعلن
212214
```

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
setup(
88
name="asosoft",
9-
version="0.1.3",
9+
version="0.2.0",
1010
description="AsoSoft's Library for Kurdish language processing tasks",
1111
keywords='natural-language-processing, normalization, unicode-normalization, central-kurdish, kurdish, sorani',
1212
package_dir={'': 'src'},

src/asosoft/G2P.py

Lines changed: 36 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from collections import OrderedDict
2525

2626
# Normalization
27-
def _G2PNormalize(text):
27+
def G2P_normalize(text):
2828
s = [
2929
" +", " " ,
3030
"دٚ", "ڎ",
@@ -54,37 +54,37 @@ def _G2PNormalize(text):
5454
text = re.sub(s[i], s[i + 1], text)
5555
return text
5656

57-
_History = {}
58-
_path = os.path.dirname(__file__)
59-
_G2PExceptions = {}
60-
_G2PCertain = {}
61-
def _load_replaces():
62-
with open(os.path.join(_path, "resources/G2PExceptions.csv"), 'r', encoding="utf-8", newline='\n') as csvfile:
57+
history = {}
58+
path = os.path.dirname(__file__)
59+
G2P_exceptions = {}
60+
G2P_certain = {}
61+
def load_replaces():
62+
with open(os.path.join(path, "resources/G2PExceptions.csv"), 'r', encoding="utf-8", newline='\n') as csvfile:
6363
reader = csv.reader(csvfile)
6464
next(reader) # Skip the first row
6565
for row in reader:
66-
_G2PExceptions[row[0]] = row[1]
66+
G2P_exceptions[row[0]] = row[1]
6767

68-
with open(os.path.join(_path, "resources/G2PCertain.csv"), 'r', encoding="utf-8", newline='\n') as csvfile:
68+
with open(os.path.join(path, "resources/G2PCertain.csv"), 'r', encoding="utf-8", newline='\n') as csvfile:
6969
reader = csv.reader(csvfile)
7070
next(reader) # Skip the first row
7171
for row in reader:
72-
_G2PCertain[row[0]] = row[1]
72+
G2P_certain[row[0]] = row[1]
7373

7474

7575
# GEN: generates all possible candidates:
7676
# e.g. بوون => bûn, buwn, bwun
7777

78-
def _Generator(gr):
79-
if len(_G2PExceptions) == 0:
80-
_load_replaces()
78+
def Generator(gr):
79+
if len(G2P_exceptions) == 0:
80+
load_replaces()
8181

8282
# Converting exceptional words
83-
for key, value in _G2PExceptions.items():
83+
for key, value in G2P_exceptions.items():
8484
gr = re.sub(key, value, gr)
8585

8686
# Converting certain characters
87-
for key, value in _G2PCertain.items():
87+
for key, value in G2P_certain.items():
8888
gr = re.sub(key, value, gr)
8989

9090
# Uncertainty in "و" and "ی"
@@ -132,10 +132,10 @@ def _Generator(gr):
132132
CandList1.append(TempList[i] + temp[j])
133133

134134
# Adding "i" between Consonant Clusters
135-
Candidates = _iInsertion(CandList1)
135+
Candidates = i_insertion(CandList1)
136136

137137
# ======= Syllabification for each candidate
138-
OutputCandidates = _Syllabification(Candidates)
138+
OutputCandidates = syllabification(Candidates)
139139

140140
# for speed up: remove candidates that has 1) syllable without vowel or 2) more than 3 consonants in coda
141141
cCount = len(OutputCandidates)
@@ -150,7 +150,7 @@ def _Generator(gr):
150150

151151
# insertion of hidden /i/ vowel
152152
# e.g. brd => bird, brid, birid
153-
def _iInsertion(Cands):
153+
def i_insertion(Cands):
154154
Candidates = []
155155
for i in range(len(Cands)):
156156
ThisCand = []
@@ -171,7 +171,7 @@ def _iInsertion(Cands):
171171

172172
# Syllabification of candidates
173173
# e.g. dexom => ˈdeˈxom
174-
def _Syllabification(Candidates):
174+
def syllabification(Candidates):
175175
cCount = len(Candidates)
176176
for i in range(cCount):
177177
# Onset C(C)V
@@ -184,7 +184,7 @@ def _Syllabification(Candidates):
184184
return Candidates
185185

186186
# Sonority Sequencing Principle in EVAL needs phoneme ranking
187-
def _SonorityIndex(ch):
187+
def sonority_index(ch):
188188
c = str(ch)
189189
if re.search(r"[wy]", c): # Approximant
190190
return 6
@@ -201,7 +201,7 @@ def _SonorityIndex(ch):
201201

202202

203203
# EVAL: specifies a penalty number for each syllabified candidate
204-
def _EVAL(Candidates):
204+
def EVAL(Candidates):
205205
output = {}
206206
if len(Candidates) > 0:
207207
Penalty = {}
@@ -222,7 +222,7 @@ def _EVAL(Candidates):
222222
for coda in codas:
223223
chars = coda
224224
for j in range(len(chars) - 1):
225-
if _SonorityIndex(chars[j]) <= _SonorityIndex(chars[j + 1]):
225+
if sonority_index(chars[j]) <= sonority_index(chars[j + 1]):
226226
P += 10
227227
# DEP: i insertion
228228
P += candidate.count("i") * 2
@@ -248,6 +248,7 @@ def _EVAL(Candidates):
248248
P += candidate.count("wi") * 2
249249
P += candidate.count("iw") * 2
250250
P += candidate.count("wû") * 5
251+
P += candidate.count("uˈwî") * 1
251252

252253
# ˈdiˈrêˈjayˈyî => ˈdiˈrêˈjaˈyîy (not heyyî and teyyî)
253254
# ˈdiˈrêjˈyî => ˈdiˈrêˈjîy
@@ -286,47 +287,47 @@ def _EVAL(Candidates):
286287
pat = re.search(r"([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])", candidate)
287288
if pat:
288289
C = re.sub("[iˈ]", "", pat.group())
289-
if _SonorityIndex(C[1]) > _SonorityIndex(C[2]):
290+
if sonority_index(C[1]) > sonority_index(C[2]):
290291
P += 3
291292
# ('sern'cê => 'se'rin'cê)
292293
pat = re.search(r"([^aeêouûiîˈ])([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])", candidate)
293294
if pat:
294295
C = re.sub("[iˈ]", "", pat.group())
295-
if _SonorityIndex(C[0]) > _SonorityIndex(C[1]):
296+
if sonority_index(C[0]) > sonority_index(C[1]):
296297
P += 3
297298
# ('ser'ni'cê => 'se'rin'cê)
298299
pat = re.search(r"([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])iˈ([^aeêouûiîˈ])", candidate)
299300
if pat:
300301
C = re.sub("[iˈ]", "", pat.group())
301-
if _SonorityIndex(C[0]) > _SonorityIndex(C[1]) and _SonorityIndex(C[1]) > _SonorityIndex(C[2]):
302+
if sonority_index(C[0]) > sonority_index(C[1]) and sonority_index(C[1]) > sonority_index(C[2]):
302303
P += 3
303304
# ('gi'rit'nê => 'gir'ti'nê) ('ku'şit'ne => 'kuş'ti'ne)
304305
pat = re.search(r"[aeêouûiî]ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])", candidate)
305306
if pat:
306307
C = re.sub("[aeêouûiîˈ]", "", pat.group())
307-
if _SonorityIndex(C[2]) >= _SonorityIndex(C[1]):
308+
if sonority_index(C[2]) >= sonority_index(C[1]):
308309
P += 3
309310
Penalty[candidate] = P
310311

311312
output = OrderedDict(sorted(Penalty.items(), key=lambda x: x[1]))
312313
return output
313314

314315
# chooses the best candidates for the word
315-
def _Evaluator(gr, Candidates):
316+
def evaluator(gr, Candidates):
316317
Output = []
317-
evaluatedCandidates = _EVAL(Candidates)
318+
evaluatedCandidates = EVAL(Candidates)
318319
if len(evaluatedCandidates) > 0:
319320
LowestPenalt = list(evaluatedCandidates.values())[0]
320321
for key, value in evaluatedCandidates.items():
321322
if value < LowestPenalt + 5:
322323
Output.append(key)
323324
return gr if len(Output) == 0 else '¶'.join(Output)
324325

325-
def _WordG2P(gr, SingleOutputPerWord):
326+
def word_G2P(gr, SingleOutputPerWord):
326327
# Check history for speed up
327-
if gr not in _History:
328-
_History[gr] = _Evaluator(gr, _Generator(gr))
329-
return _History[gr].split('¶')[0] if SingleOutputPerWord else _History[gr]
328+
if gr not in history:
329+
history[gr] = evaluator(gr, Generator(gr))
330+
return history[gr].split('¶')[0] if SingleOutputPerWord else history[gr]
330331

331332
# Converts Central Kurdish text in standard Arabic script into syllabified phonemic Latin script (i.e. graphemes to phonems)
332333
def KurdishG2P(text, convertNumbersToWord=False, backMergeConjunction=True, singleOutputPerWord=True):
@@ -335,13 +336,13 @@ def KurdishG2P(text, convertNumbersToWord=False, backMergeConjunction=True, sing
335336
if convertNumbersToWord:
336337
text = Number2Word(text)
337338

338-
text = _G2PNormalize(text.strip())
339+
text = G2P_normalize(text.strip())
339340

340341
ku = "ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهیێ" + "ۋۉۊڎڴݵݸ"
341342
wordss = re.findall(f"([{ku}]+|[^{ku}]+)", text)
342343
for word in wordss:
343344
if re.search(f"[{ku}]", word) and word != "و":
344-
sb.append(_WordG2P(re.sub(f"[^{ku}]+", "", word), singleOutputPerWord))
345+
sb.append(word_G2P(re.sub(f"[^{ku}]+", "", word), singleOutputPerWord))
345346
else:
346347
sb.append(word)
347348
output = ''.join(sb)
@@ -366,4 +367,4 @@ def KurdishG2P(text, convertNumbersToWord=False, backMergeConjunction=True, sing
366367
# if conjunction makes candidates the same (e.g ˈbîsˈtû¶ˈbîsˈtû)
367368
output = re.sub(r"(\w+)¶\1(\s|$)", r"\1", output)
368369

369-
return output.rstrip()
370+
return output.rstrip()

0 commit comments

Comments
 (0)