24
24
from collections import OrderedDict
25
25
26
26
# Normalization
27
- def _G2PNormalize (text ):
27
+ def G2P_normalize (text ):
28
28
s = [
29
29
" +" , " " ,
30
30
"دٚ" , "ڎ" ,
@@ -54,37 +54,37 @@ def _G2PNormalize(text):
54
54
text = re .sub (s [i ], s [i + 1 ], text )
55
55
return text
56
56
57
# Shared module state for the G2P pipeline:
# history        — memoization cache: grapheme word -> '¶'-joined phoneme candidates
# path           — directory of this module, used to locate bundled resources
# G2P_exceptions — pattern -> replacement pairs for exceptional words
# G2P_certain    — pattern -> replacement pairs for unambiguous characters
history = {}
path = os.path.dirname(__file__)
G2P_exceptions = {}
G2P_certain = {}


def load_replaces():
    """Populate the G2P replacement tables from the bundled CSV resources.

    Fills the module-level ``G2P_exceptions`` and ``G2P_certain`` dicts with
    (pattern, replacement) pairs read from ``resources/G2PExceptions.csv``
    and ``resources/G2PCertain.csv``. Intended to be called lazily, once,
    before the first conversion.
    """
    def _read_into(filename, table):
        # Each CSV has a header row, then rows of (pattern, replacement).
        with open(os.path.join(path, filename), 'r', encoding="utf-8", newline='\n') as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # Skip the header row
            for row in reader:
                table[row[0]] = row[1]

    _read_into("resources/G2PExceptions.csv", G2P_exceptions)
    _read_into("resources/G2PCertain.csv", G2P_certain)
73
73
74
74
75
75
# GEN: generates all possible candidates:
76
76
# e.g. بوون => bûn, buwn, bwun
77
77
78
- def _Generator (gr ):
79
- if len (_G2PExceptions ) == 0 :
80
- _load_replaces ()
78
+ def Generator (gr ):
79
+ if len (G2P_exceptions ) == 0 :
80
+ load_replaces ()
81
81
82
82
# Converting exceptional words
83
- for key , value in _G2PExceptions .items ():
83
+ for key , value in G2P_exceptions .items ():
84
84
gr = re .sub (key , value , gr )
85
85
86
86
# Converting certain characters
87
- for key , value in _G2PCertain .items ():
87
+ for key , value in G2P_certain .items ():
88
88
gr = re .sub (key , value , gr )
89
89
90
90
# Uncertainty in "و" and "ی"
@@ -132,10 +132,10 @@ def _Generator(gr):
132
132
CandList1 .append (TempList [i ] + temp [j ])
133
133
134
134
# Adding "i" between Consonant Clusters
135
- Candidates = _iInsertion (CandList1 )
135
+ Candidates = i_insertion (CandList1 )
136
136
137
137
# ======= Syllabification for each candidate
138
- OutputCandidates = _Syllabification (Candidates )
138
+ OutputCandidates = syllabification (Candidates )
139
139
140
140
# for speed up: remove candidates that has 1) syllable without vowel or 2) more than 3 consonants in coda
141
141
cCount = len (OutputCandidates )
@@ -150,7 +150,7 @@ def _Generator(gr):
150
150
151
151
# insertion of hidden /i/ vowel
152
152
# e.g. brd => bird, brid, birid
153
- def _iInsertion (Cands ):
153
+ def i_insertion (Cands ):
154
154
Candidates = []
155
155
for i in range (len (Cands )):
156
156
ThisCand = []
@@ -171,7 +171,7 @@ def _iInsertion(Cands):
171
171
172
172
# Syllabification of candidates
173
173
# e.g. dexom => ˈdeˈxom
174
- def _Syllabification (Candidates ):
174
+ def syllabification (Candidates ):
175
175
cCount = len (Candidates )
176
176
for i in range (cCount ):
177
177
# Onset C(C)V
@@ -184,7 +184,7 @@ def _Syllabification(Candidates):
184
184
return Candidates
185
185
186
186
# Sonority Sequencing Principle in EVAL needs phoneme ranking
187
- def _SonorityIndex (ch ):
187
+ def sonority_index (ch ):
188
188
c = str (ch )
189
189
if re .search (r"[wy]" , c ): # Approximant
190
190
return 6
@@ -201,7 +201,7 @@ def _SonorityIndex(ch):
201
201
202
202
203
203
# EVAL: specifies a penalty number for each syllabified candidate
204
- def _EVAL (Candidates ):
204
+ def EVAL (Candidates ):
205
205
output = {}
206
206
if len (Candidates ) > 0 :
207
207
Penalty = {}
@@ -222,7 +222,7 @@ def _EVAL(Candidates):
222
222
for coda in codas :
223
223
chars = coda
224
224
for j in range (len (chars ) - 1 ):
225
- if _SonorityIndex (chars [j ]) <= _SonorityIndex (chars [j + 1 ]):
225
+ if sonority_index (chars [j ]) <= sonority_index (chars [j + 1 ]):
226
226
P += 10
227
227
# DEP: i insertion
228
228
P += candidate .count ("i" ) * 2
@@ -248,6 +248,7 @@ def _EVAL(Candidates):
248
248
P += candidate .count ("wi" ) * 2
249
249
P += candidate .count ("iw" ) * 2
250
250
P += candidate .count ("wû" ) * 5
251
+ P += candidate .count ("uˈwî" ) * 1
251
252
252
253
# ˈdiˈrêˈjayˈyî => ˈdiˈrêˈjaˈyîy (not heyyî and teyyî)
253
254
# ˈdiˈrêjˈyî => ˈdiˈrêˈjîy
@@ -286,47 +287,47 @@ def _EVAL(Candidates):
286
287
pat = re .search (r"([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])" , candidate )
287
288
if pat :
288
289
C = re .sub ("[iˈ]" , "" , pat .group ())
289
- if _SonorityIndex (C [1 ]) > _SonorityIndex (C [2 ]):
290
+ if sonority_index (C [1 ]) > sonority_index (C [2 ]):
290
291
P += 3
291
292
# ('sern'cê => 'se'rin'cê)
292
293
pat = re .search (r"([^aeêouûiîˈ])([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])" , candidate )
293
294
if pat :
294
295
C = re .sub ("[iˈ]" , "" , pat .group ())
295
- if _SonorityIndex (C [0 ]) > _SonorityIndex (C [1 ]):
296
+ if sonority_index (C [0 ]) > sonority_index (C [1 ]):
296
297
P += 3
297
298
# ('ser'ni'cê => 'se'rin'cê)
298
299
pat = re .search (r"([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])iˈ([^aeêouûiîˈ])" , candidate )
299
300
if pat :
300
301
C = re .sub ("[iˈ]" , "" , pat .group ())
301
- if _SonorityIndex (C [0 ]) > _SonorityIndex (C [1 ]) and _SonorityIndex (C [1 ]) > _SonorityIndex (C [2 ]):
302
+ if sonority_index (C [0 ]) > sonority_index (C [1 ]) and sonority_index (C [1 ]) > sonority_index (C [2 ]):
302
303
P += 3
303
304
# ('gi'rit'nê => 'gir'ti'nê) ('ku'şit'ne => 'kuş'ti'ne)
304
305
pat = re .search (r"[aeêouûiî]ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])" , candidate )
305
306
if pat :
306
307
C = re .sub ("[aeêouûiîˈ]" , "" , pat .group ())
307
- if _SonorityIndex (C [2 ]) >= _SonorityIndex (C [1 ]):
308
+ if sonority_index (C [2 ]) >= sonority_index (C [1 ]):
308
309
P += 3
309
310
Penalty [candidate ] = P
310
311
311
312
output = OrderedDict (sorted (Penalty .items (), key = lambda x : x [1 ]))
312
313
return output
313
314
314
315
# chooses the best candidates for the word
315
def evaluator(gr, Candidates):
    """Select the best syllabified candidate(s) for a word.

    Ranks *Candidates* by penalty via ``EVAL`` and keeps every candidate
    whose penalty is within 5 points of the lowest one. Returns the
    survivors joined by '¶', or the original grapheme string *gr* when
    there are no candidates at all.
    """
    ranked = EVAL(Candidates)
    if not ranked:
        # No candidates could be generated; fall back to the raw graphemes.
        return gr
    # ranked is sorted ascending by penalty, so the first value is the best.
    best = next(iter(ranked.values()))
    kept = [cand for cand, penalty in ranked.items() if penalty < best + 5]
    return '¶'.join(kept) if kept else gr
324
325
325
def word_G2P(gr, SingleOutputPerWord):
    """Convert a single word's graphemes to phonemes, with memoization.

    Results are cached in the module-level ``history`` dict so repeated
    words are generated and evaluated only once. When *SingleOutputPerWord*
    is true, only the first (best) '¶'-separated candidate is returned.
    """
    cached = history.get(gr)
    if cached is None:
        # First time we see this word: generate candidates and rank them.
        cached = evaluator(gr, Generator(gr))
        history[gr] = cached
    return cached.split('¶')[0] if SingleOutputPerWord else cached
330
331
331
332
# Converts Central Kurdish text in standard Arabic script into syllabified phonemic Latin script (i.e. graphemes to phonems)
332
333
def KurdishG2P (text , convertNumbersToWord = False , backMergeConjunction = True , singleOutputPerWord = True ):
@@ -335,13 +336,13 @@ def KurdishG2P(text, convertNumbersToWord=False, backMergeConjunction=True, sing
335
336
if convertNumbersToWord :
336
337
text = Number2Word (text )
337
338
338
- text = _G2PNormalize (text .strip ())
339
+ text = G2P_normalize (text .strip ())
339
340
340
341
ku = "ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهیێ" + "ۋۉۊڎڴݵݸ"
341
342
wordss = re .findall (f"([{ ku } ]+|[^{ ku } ]+)" , text )
342
343
for word in wordss :
343
344
if re .search (f"[{ ku } ]" , word ) and word != "و" :
344
- sb .append (_WordG2P (re .sub (f"[^{ ku } ]+" , "" , word ), singleOutputPerWord ))
345
+ sb .append (word_G2P (re .sub (f"[^{ ku } ]+" , "" , word ), singleOutputPerWord ))
345
346
else :
346
347
sb .append (word )
347
348
output = '' .join (sb )
@@ -366,4 +367,4 @@ def KurdishG2P(text, convertNumbersToWord=False, backMergeConjunction=True, sing
366
367
# if conjunction makes candidates the same (e.g ˈbîsˈtû¶ˈbîsˈtû)
367
368
output = re .sub (r"(\w+)¶\1(\s|$)" , r"\1" , output )
368
369
369
- return output .rstrip ()
370
+ return output .rstrip ()
0 commit comments