import unittest
# ---------------------------------------------------------------------------
# Character-class bit flags.
#
# Every entry of KHMERCHAR (and therefore every value returned by
# khmerType()) is a combination of these flags.  Each flag occupies its own
# bit, so flags can be combined with ``|`` and tested with ``&``.
# ---------------------------------------------------------------------------
BASE = 1         # character stored by reorder() as the base of a cluster
VOWEL = 2        # character stored as the cluster's vowel
SHIFTER = 4      # character stored as the cluster's shifter
COENG = 8        # coeng sign; reorder() takes it together with the next char
SIGN = 16        # character stored as the cluster's sign
LEFT = 32        # vowel that, met after a base, opens the next cluster
WITHE = 64       # vowel that merges with a preceding SRAE (see sraEcombining)
WITHU = 128      # vowel/sign that interacts with SRAU (rewritten as shifter)
POSRAA = 256     # subscript under which PO + SRAAA is NOT turned into NYO
MUUS = 512       # consonant that, as a subscript, takes MUUSIKATOAN
TRII = 1024      # consonant that takes TRIISAP as its shifter
ROBAT = 2048     # character stored as the cluster's robat

# ---------------------------------------------------------------------------
# Individual characters that reorder() tests for by value.
#
# Written as unicode escapes rather than unichr() calls: the values are the
# same on Python 2, and the module no longer fails at import time on
# Python 3, where unichr() does not exist.
# ---------------------------------------------------------------------------
RO = u'\u179A'            # KHMER LETTER RO
PO = u'\u1796'            # KHMER LETTER PO
SRAAA = u'\u17B6'         # KHMER VOWEL SIGN AA
SRAE = u'\u17C1'          # KHMER VOWEL SIGN E
SRAOE = u'\u17BE'         # KHMER VOWEL SIGN OE
SRAOO = u'\u17C4'         # KHMER VOWEL SIGN OO
SRAYA = u'\u17BF'         # KHMER VOWEL SIGN YA
SRAIE = u'\u17C0'         # KHMER VOWEL SIGN IE
SRAAU = u'\u17C5'         # KHMER VOWEL SIGN AU
SRAII = u'\u17B8'         # KHMER VOWEL SIGN II
SRAU = u'\u17BB'          # KHMER VOWEL SIGN U
TRIISAP = u'\u17CA'       # KHMER SIGN TRIISAP
MUUSIKATOAN = u'\u17C9'   # KHMER SIGN MUUSIKATOAN
SA = u'\u179F'            # KHMER LETTER SA
SAMYOKSANNYA = u'\u17D0'  # KHMER SIGN SAMYOK SANNYA
NYO = u'\u1789'           # KHMER LETTER NYO
ZWSP = u'\u200B'          # ZERO WIDTH SPACE

# When the cluster's vowel is SRAE and a WITHE vowel follows, reorder()
# replaces the pair by the single vowel looked up here (key = second vowel).
# The keys are exactly the characters flagged WITHE in KHMERCHAR.
sraEcombining = {
    SRAII: SRAOE,
    SRAYA: SRAYA,
    SRAIE: SRAIE,
    SRAAA: SRAOO,
    SRAAU: SRAAU,
}

# Character-class table indexed by ``code point - 0x1780`` (see khmerType()).
# It covers U+1780 .. U+17D3; anything outside that range has type 0.
KHMERCHAR = [
    BASE,                    # U+1780
    BASE,                    # U+1781
    BASE,                    # U+1782
    BASE,                    # U+1783
    BASE,                    # U+1784
    BASE,                    # U+1785
    BASE,                    # U+1786
    BASE,                    # U+1787
    BASE,                    # U+1788
    BASE | MUUS,             # U+1789 (NYO)
    BASE,                    # U+178A
    BASE,                    # U+178B
    BASE,                    # U+178C
    BASE,                    # U+178D
    BASE,                    # U+178E
    BASE | POSRAA,           # U+178F
    BASE,                    # U+1790
    BASE,                    # U+1791
    BASE,                    # U+1792
    BASE | POSRAA,           # U+1793
    BASE | MUUS,             # U+1794
    BASE,                    # U+1795
    BASE,                    # U+1796 (PO)
    BASE | POSRAA,           # U+1797
    BASE,                    # U+1798
    BASE | POSRAA,           # U+1799
    BASE | POSRAA,           # U+179A (RO)
    BASE | POSRAA,           # U+179B
    BASE | POSRAA,           # U+179C
    BASE,                    # U+179D
    BASE,                    # U+179E
    BASE | TRII,             # U+179F (SA)
    BASE,                    # U+17A0
    BASE,                    # U+17A1
    BASE | TRII,             # U+17A2
    BASE,                    # U+17A3
    BASE,                    # U+17A4
    BASE,                    # U+17A5
    BASE,                    # U+17A6
    BASE,                    # U+17A7
    BASE,                    # U+17A8
    BASE,                    # U+17A9
    BASE,                    # U+17AA
    BASE,                    # U+17AB
    BASE,                    # U+17AC
    BASE,                    # U+17AD
    BASE,                    # U+17AE
    BASE,                    # U+17AF
    BASE,                    # U+17B0
    BASE,                    # U+17B1
    BASE,                    # U+17B2
    BASE,                    # U+17B3
    0,                       # U+17B4 (no class assigned)
    0,                       # U+17B5 (no class assigned)
    VOWEL | WITHE | WITHU,   # U+17B6 (SRAAA)
    VOWEL | WITHU,           # U+17B7
    VOWEL | WITHE | WITHU,   # U+17B8 (SRAII)
    VOWEL | WITHU,           # U+17B9
    VOWEL | WITHU,           # U+17BA
    VOWEL,                   # U+17BB (SRAU)
    VOWEL,                   # U+17BC
    VOWEL,                   # U+17BD
    VOWEL | WITHU,           # U+17BE (SRAOE)
    VOWEL | WITHE,           # U+17BF (SRAYA)
    VOWEL | WITHE,           # U+17C0 (SRAIE)
    VOWEL | LEFT,            # U+17C1 (SRAE)
    VOWEL | LEFT,            # U+17C2
    VOWEL | LEFT,            # U+17C3
    VOWEL,                   # U+17C4 (SRAOO)
    VOWEL | WITHE,           # U+17C5 (SRAAU)
    SIGN | WITHU,            # U+17C6
    SIGN,                    # U+17C7
    SIGN,                    # U+17C8
    SHIFTER,                 # U+17C9 (MUUSIKATOAN)
    SHIFTER,                 # U+17CA (TRIISAP)
    SIGN,                    # U+17CB
    ROBAT,                   # U+17CC
    SIGN,                    # U+17CD
    SIGN,                    # U+17CE
    SIGN,                    # U+17CF
    SIGN | WITHU,            # U+17D0 (SAMYOKSANNYA)
    SIGN,                    # U+17D1
    COENG,                   # U+17D2
    SIGN,                    # U+17D3
]
# The text type of the running interpreter: ``unicode`` on Python 2 and
# ``str`` on Python 3.  Derived from a literal so no version-specific
# builtin name has to exist.
_TEXT_TYPE = type(u'')


def khmerType(uniChar):
    """Return the Khmer character-class flags of a single character.

    Parameters:
        uniChar -- a text (unicode) string containing exactly one character.

    Returns:
        The KHMERCHAR entry for the character, i.e. an integer combining
        flags such as BASE, VOWEL or SIGN, when the code point lies inside
        the table (which starts at U+1780); otherwise 0.

    Raises:
        TypeError -- if uniChar is not a text string, or if its length is
        not exactly one.
    """
    # isinstance() rather than an exact type comparison, so subclasses of the
    # text type are accepted as well.
    if not isinstance(uniChar, _TEXT_TYPE):
        raise TypeError('only accept one character')
    if len(uniChar) != 1:
        raise TypeError('only accept one character, but ' + str(len(uniChar)) + ' chars found.')

    # KHMERCHAR is indexed relative to U+1780.  Code points below the block
    # give a negative offset and code points past the table give an offset
    # that is too large; both fall through to 0.
    offset = ord(uniChar[0]) - 0x1780
    if 0 <= offset < len(KHMERCHAR):
        return KHMERCHAR[offset]
    return 0
def reorder(sin):
    """
    Convert Khmer text typed in visual order into rule-based cluster order.

    The input is scanned cluster by cluster.  The parts found for a cluster
    are re-emitted as:
        base + robat + shifter + coeng2 + coeng1 + shifter + vowel + sign
    followed by the non-Khmer character (if any) that ended the cluster.
    While doing so a few visual spellings are rewritten:
      * SRAE followed by a WITHE vowel becomes the vowel in sraEcombining;
      * SRAU combined with a WITHU vowel (or with SAMYOK SANNYA) becomes a
        shifter, TRIISAP or MUUSIKATOAN;
      * PO + SRAAA over a subscript that is not flagged POSRAA becomes NYO.
    A ZERO WIDTH SPACE does not end a cluster; it is kept only when nothing
    belonging to the same cluster follows it.

    Parameters:
        sin -- the text (unicode) string to reorder.

    Returns:
        A new text string holding the reordered clusters.

    Raises:
        TypeError -- if sin is not a text string.
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    if not isinstance(sin, type(u'')):
        raise TypeError('only accept unicode string')

    pieces = []            # finished clusters, joined once at the end
    sinLimit = len(sin) - 1
    i = -1                 # index of the last character consumed so far
    while i < sinLimit:
        # Start a new, empty cluster.
        baseChar = ''
        robat = ''
        shifter1 = ''
        shifter2 = ''
        coeng1 = ''
        coeng2 = ''
        vowel = ''
        poSraA = False     # True once PO has been seen together with SRAAA
        sign = ''
        keep = ''          # trailing ZWSP / non-Khmer char after the cluster

        while i < sinLimit:
            i += 1
            ch = sin[i]
            sinType = khmerType(ch)

            # In every branch below, "i -= 1; break" means: the character
            # belongs to the next cluster, so un-read it and close this one.
            if sinType & BASE:
                if baseChar:
                    i -= 1
                    break
                baseChar = ch
                keep = ''

            elif sinType & ROBAT:
                if robat:
                    i -= 1
                    break
                robat = ch
                keep = ''

            elif sinType & SHIFTER:
                if shifter1:
                    i -= 1
                    break
                shifter1 = ch
                keep = ''

            elif sinType & SIGN:
                if sign:
                    i -= 1
                    break
                sign = ch
                keep = ''

            elif sinType & COENG:
                if i == sinLimit:
                    # Dangling coeng at the very end of the input: keep it
                    # on its own (coeng1 is then a single character).
                    coeng1 = ch
                    break
                # Coeng + RO met after a base is taken to open the next
                # cluster.
                if sin[i + 1] == RO and baseChar:
                    i -= 1
                    break
                if coeng1 == '':
                    coeng1 = sin[i:i + 2]
                    i += 1
                    keep = ''
                elif coeng1[1] == RO and not coeng2:
                    # Only a cluster that started with coeng + RO may take a
                    # second subscript.  The ``not coeng2`` guard keeps a
                    # third subscript from overwriting (and so silently
                    # dropping) the second one; it starts a new cluster
                    # instead.
                    coeng2 = sin[i:i + 2]
                    i += 1
                    keep = ''
                else:
                    i -= 1
                    break

            elif sinType & VOWEL:
                if vowel == '':
                    # A LEFT vowel met after the base is taken to open the
                    # next cluster.
                    if (sinType & LEFT) and baseChar:
                        i -= 1
                        break
                    vowel = ch
                    keep = ''
                elif (baseChar == PO and not poSraA
                        and (ch == SRAAA or vowel == SRAAA)):
                    # PO together with SRAAA: remember it; whether this is
                    # really NYO is decided once the subscripts are known.
                    poSraA = True
                    if vowel == SRAAA:
                        vowel = ch
                        keep = ''
                else:
                    vowelType = khmerType(vowel)
                    if vowel == SRAE and (sinType & WITHE):
                        vowel = sraEcombining[ch]
                        keep = ''
                    elif ((vowel == SRAU and (sinType & WITHU))
                            or ((vowelType & WITHU) and ch == SRAU)):
                        # SRAU next to a WITHU vowel stands for a shifter;
                        # keep whichever of the two is the WITHU vowel.
                        if not (vowelType & WITHU):
                            vowel = ch
                        if baseChar and (khmerType(baseChar) & TRII):
                            shifter1 = TRIISAP
                        else:
                            shifter1 = MUUSIKATOAN
                    elif vowel == SRAE and ch == SRAU:
                        # SRAE + SRAU: the SRAU again stands for a shifter,
                        # SRAE stays and may still combine with a later vowel.
                        if baseChar and (khmerType(baseChar) & TRII):
                            shifter1 = TRIISAP
                        else:
                            shifter1 = MUUSIKATOAN
                    else:
                        i -= 1
                        break

            else:
                # Character with no Khmer class.
                if ch == ZWSP:
                    # A zero width space does not end the cluster; it is
                    # dropped if more of the cluster follows.
                    keep = ZWSP
                else:
                    keep = ch
                    break

        # SRAU + SAMYOK SANNYA is written as MUUSIKATOAN + SAMYOK SANNYA.
        if vowel == SRAU and sign and (khmerType(sign) & WITHU):
            if sign == SAMYOKSANNYA:
                vowel = ''
                shifter1 = MUUSIKATOAN

        # With a subscript flagged TRII / MUUS the shifter is emitted after
        # the subscript.  Requiring a two-character coeng1 avoids the
        # IndexError that a dangling end-of-input coeng used to cause here.
        if shifter1 and len(coeng1) == 2:
            coengType = khmerType(coeng1[1])
            if coengType & TRII:
                shifter2 = TRIISAP
                shifter1 = ''
            elif coengType & MUUS:
                shifter2 = MUUSIKATOAN
                shifter1 = ''

        # PO + SRAAA over a subscript that is not flagged POSRAA is NYO.
        underPoSraA = coeng2 or coeng1
        if len(underPoSraA) == 2:
            underPoSraA = khmerType(underPoSraA[1]) & POSRAA
            if ((poSraA and (not underPoSraA) and vowel)
                    or (baseChar == PO and vowel == SRAAA
                        and (not underPoSraA))):
                baseChar = NYO
                # The SRAAA was part of the NYO glyph, not a real vowel.
                if vowel == SRAAA and not poSraA:
                    vowel = ''

        # PO + SRAAA + SRAE is PO + SRAOO.
        if poSraA and vowel == SRAE:
            vowel = SRAOO

        cluster = (baseChar + robat + shifter1 + coeng2 + coeng1
                   + shifter2 + vowel + sign)
        pieces.append(cluster + keep)

    return u''.join(pieces)
class TestReordering(unittest.TestCase):
    """Unit tests for khmerType() and reorder().

    Most fixtures are Khmer literals; the order of the code points inside
    each literal is what is being tested, so they must not be normalised
    or retyped.
    """

    def testKhmerType(self):
        # Code points just outside the table, and U+17D4 / U+17FF past its
        # end, have type 0; entries inside the table carry their flags.
        self.assertEqual(khmerType(unichr(0x177F)), 0)
        self.assertEqual(khmerType(unichr(0x1780)) & BASE, BASE)
        self.assertEqual(khmerType(unichr(0x17B6)), VOWEL + WITHE + WITHU)
        self.assertEqual(khmerType(unichr(0x17C9)), SHIFTER)
        self.assertEqual(khmerType(unichr(0x17CB)), SIGN)
        self.assertEqual(khmerType(unichr(0x17D4)), 0)
        self.assertEqual(khmerType(unichr(0x17ff)), 0)

    def testReorder(self):
        # Each pair is (input, expected output of reorder()).
        self.assertEqual(reorder(u'កករ'), u'កករ')
        self.assertEqual(reorder(u'បា៉'), u'ប៉ា')
        self.assertEqual(reorder(u'បូ៊'), u'ប៊ូ')
        self.assertEqual(reorder(u'របំា'), u'របាំ')
        self.assertEqual(reorder(u'របាំ'), u'របាំ')
        self.assertEqual(reorder(u'្រកដាស្របដាល់កណ្ដាល'), u'ក្រដាសប្រដាល់កណ្ដាល')
        self.assertEqual(reorder(u'បេង្គាល ខាងេលី េសៀវេភៅ'), u'បង្គោល ខាងលើ សៀវភៅ')
        self.assertEqual(reorder(u'សីុបុីអុី'), u'ស៊ីប៉ីអ៊ី')
        self.assertEqual(reorder(u'ន្សីុ'), u'ន្ស៊ី')
        self.assertEqual(reorder(u'េគ្របែឡងគ្នា'), u'គេប្រឡែងគ្នា')
        self.assertEqual(reorder(u'បពា្ញា'), u'បញ្ញា')
        self.assertEqual(reorder(u'បព្ជាី'), u'បញ្ជី')
        self.assertEqual(reorder(u'កេ្រព្ជាាង'), u'កញ្ជ្រោង')
        # Text without Khmer characters passes through unchanged.
        self.assertEqual(reorder(u'this is english text'), u'this is english text')
        self.assertEqual(reorder(u'ចំេពាះ'), u'ចំពោះ')
        self.assertEqual(reorder(u'្របឹក្សាធម្មនុពា្ញ'), u'ប្រឹក្សាធម្មនុញ្ញ')
        self.assertEqual(reorder(u'ៃហប៊ី'), u'ហៃប៊ី')
        self.assertEqual(reorder(u'បូពា៌'), u'បូព៌ា')
        self.assertEqual(reorder(u'បានេស្នី'), u'បានស្នើ')
        # Coeng followed by a non-Khmer letter, and a lone coeng.
        self.assertEqual(reorder(u'្K្ក្េ'), u'្K្ក្េ')
        self.assertEqual(reorder(u'្'), u'្')
        # Zero width spaces inside a cluster are dropped; the one between
        # clusters is kept.
        self.assertEqual(reorder(u'រ'+unichr(0x200B)+u'ដ្ឋ'+unichr(0x200B)+u'ាភិប'+unichr(0x200B)
            +u'ាល'), u'រ'+unichr(0x200B)+u'ដ្ឋាភិបាល')
        self.assertEqual(reorder(u'េ្របី'), u'ប្រើ')
        self.assertEqual(reorder(u'ប្បុ័ង'), u'ប្ប៉័ង')
        self.assertEqual(reorder(u'ប្ប័ុង'), u'ប្ប៉័ង')
        self.assertEqual(reorder(u'េសីុប'), u'ស៊ើប')
        self.assertEqual(reorder(u'េបីុង'), u'ប៉ើង')
        self.assertEqual(reorder(u'េសុីប'), u'ស៊ើប')
        self.assertEqual(reorder(u'កំុេជា'), u'កុំជោ')

    def testShifter(self):
        # Two consecutive identical shifters come back unchanged.
        self.assertEqual(reorder(u'៊៊'), u'៊៊')

    def testSign(self):
        # Two consecutive identical signs come back unchanged.
        self.assertEqual(reorder(u'ះះ'), u'ះះ')

    def testAllCase(self):
        # Repeated bases, robats and shifters, plus coeng + RO placement.
        self.assertEqual(reorder(u'កង'), u'កង')
        self.assertEqual(reorder(u'៌៌'), u'៌៌')
        self.assertEqual(reorder(u'៉៊'), u'៉៊')
        self.assertEqual(reorder(u'ប្រក'), u'បក្រ')
        self.assertEqual(reorder(u'្រស្ត'), u'ស្ត្រ')

    def testKhmerTypeError(self):
        # khmerType() rejects anything that is not a one-character text string.
        self.assertRaises(TypeError, khmerType, 'KA')
        self.assertRaises(TypeError, khmerType, 1)
        self.assertRaises(TypeError, khmerType, {1:1})

    def testReorderError(self):
        # reorder() rejects a non-unicode (byte) string.
        self.assertRaises(TypeError, reorder, 'this is ansi')
# Run the unit tests above when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()