# dkavraal/türkçe.py

## türkçe.py
"""
Letter Codes
------------------------

LETTER  Windows-1254    ISO-8859-9  latin5      UTF-8
ğ       b'\xf0'         b'\xf0'     b'\xf0'     b'\xc4\x9f'
ı       b'\xfd'         b'\xfd'     b'\xfd'     b'\xc4\xb1'
ş       b'\xfe'         b'\xfe'     b'\xfe'     b'\xc5\x9f'
ü       b'\xfc'         b'\xfc'     b'\xfc'     b'\xc3\xbc'
ö       b'\xf6'         b'\xf6'     b'\xf6'     b'\xc3\xb6'
ç       b'\xe7'         b'\xe7'     b'\xe7'     b'\xc3\xa7'
Ğ       b'\xd0'         b'\xd0'     b'\xd0'     b'\xc4\x9e'
Ü       b'\xdc'         b'\xdc'     b'\xdc'     b'\xc3\x9c'
Ş       b'\xde'         b'\xde'     b'\xde'     b'\xc5\x9e'
İ       b'\xdd'         b'\xdd'     b'\xdd'     b'\xc4\xb0'
Ö       b'\xd6'         b'\xd6'     b'\xd6'     b'\xc3\x96'
Ç       b'\xc7'         b'\xc7'     b'\xc7'     b'\xc3\x87'


How to distinguish
------------------------

##1

for i in "ığşüöçĞÜŞİÖÇ":
     try:
             print(i, str(i.encode("utf-8").decode("cp1254")))
     except:
             print(i, "error")

ı Ä±
ğ ÄŸ
ş ÅŸ
ü Ã¼
ö Ã¶
ç Ã§
Ğ error
Ü Ãœ
Ş error
İ Ä°
Ö Ã–
Ç Ã‡

##2

b'\xc4'.decode("cp1254")

'Ä'

##3

>>> for i in "ığşüöçĞÜŞİÖÇ":
...     print(i, str(i.encode("utf-8")), i.encode("utf-8")[0], i.encode("utf-8")[1], chr(i.encode("utf-8")[0]), chr(i.encode("utf-8")[1]), bytearray([ i.encode("utf-8")[0] ]).decode("cp1254"), bytearray([ i.encode("utf-8")[1] ]).decode("cp1254"))
...
ı   b'\xc4\xb1'     196     177     Ä   ±       Ä   ±
ğ   b'\xc4\x9f'     196     159     Ä         Ä   Ÿ
ş   b'\xc5\x9f'     197     159     Å         Å   Ÿ
ü   b'\xc3\xbc'     195     188     Ã   ¼       Ã   ¼
ö   b'\xc3\xb6'     195     182     Ã   ¶       Ã   ¶
ç   b'\xc3\xa7'     195     167     Ã   §       Ã   §
Ğ   b'\xc4\x9e'     196     158     Ä         Ä
Ü   b'\xc3\x9c'     195     156     Ã         Ã   œ
Ş   b'\xc5\x9e'     197     158     Å         Å
İ   b'\xc4\xb0'     196     176     Ä   °       Ä   °
Ö   b'\xc3\x96'     195     150     Ã         Ã   –
Ç   b'\xc3\x87'     195     135     Ã         Ã   ‡

            actually this code:
            for i in "ığşüöçĞÜŞİÖÇ":
                try:
                    print(i, "\t", end="")
                except:
                    print("#\t", end="")
                try:
                    print(str(i.encode("utf-8")), "\t", end="")
                except:
                    print("#\t", end="")
                try:
                    print(i.encode("utf-8")[0], "\t", end="")
                except:
                    print("#\t", end="")
                try:
                    print(i.encode("utf-8")[1], "\t", end="")
                except:
                    print("#\t", end="")
                try:
                    print(chr(i.encode("utf-8")[0]), "\t", end="")
                except:
                    print("#\t", end="")
                try:
                    print(chr(i.encode("utf-8")[1]), "\t", end="")
                except:
                    print("#\t", end="")
                try:
                    print(bytearray([ i.encode("utf-8")[0] ]).decode("cp1254"), "\t", end="")
                except:
                    print("#\t", end="")
                try:
                    print(bytearray([ i.encode("utf-8")[1] ]).decode("cp1254"), "\t", end="")
                except:
                    print("#\t", end="")
                try:
                    print(bytearray([ i.encode("utf-8")[0] ]).decode("ISO-8859-9"), "\t", end="")
                except:
                    print("#\t", end="")
                try:
                    print(bytearray([ i.encode("utf-8")[1] ]).decode("ISO-8859-9"), "\t", end="")
                except:
                    print("#\t", end="")
                print()


##4
So,

Real   UTF8         ord(x)          char(x)         cp1254(x)
Chr    Encode       #1      #2      #1  #2          #1  #2
ı   b'\xc4\xb1'     196     177     Ä   ±           Ä   ±
ğ   b'\xc4\x9f'     196     159     Ä   <THIS!>     Ä   Ÿ
ş   b'\xc5\x9f'     197     159     Å   <THIS!>     Å   Ÿ
ü   b'\xc3\xbc'     195     188     Ã   ¼           Ã   ¼
ö   b'\xc3\xb6'     195     182     Ã   ¶           Ã   ¶
ç   b'\xc3\xa7'     195     167     Ã   §           Ã   §
Ğ   b'\xc4\x9e'     196     158     Ä   <THIS!>     Ä   <THIS!>
Ü   b'\xc3\x9c'     195     156     Ã   <THIS!>     Ã   œ
Ş   b'\xc5\x9e'     197     158     Å   <THIS!>     Å   <THIS!>
İ   b'\xc4\xb0'     196     176     Ä   °           Ä   °
Ö   b'\xc3\x96'     195     150     Ã   <THIS!>     Ã   –
Ç   b'\xc3\x87'     195     135     Ã   <THIS!>     Ã   ‡


##5

MAYBE(m), SURE (s)

         NOT cp1254     NOT ISO-8859-9      cp1254      ISO-8859-9     UTF8      NOT UTF8   NOT-ASCII
\x9e     s              s                                              s                    s
\x9f     s              s                                              m                    s
\x9c
\x96                    s                   m                          m                    s
\x87                    s                   m                          m                    s


##6
If the text (as a bytearray) contains b'\x9e':
    it contains "Ş" or "Ğ"  ## it is nearly impossible that it is "Ÿ"
    so it is UTF-8
and so on for the other bytes.


"""

def findAny_TR_UTF8Chars(inText):
    """
    Report whether a byte sequence holds a UTF-8 encoded Turkish letter.

    ! PRECONDITION !: the text must be Turkish. Other languages (Icelandic
    etc.) may produce the same byte pairs; that has not been tested.

    Every pair of neighbouring bytes is compared with the two-byte UTF-8
    form of the Turkish-specific letters listed below. The first match
    returns True, which is meant to be read as "UTF-8"; False is meant to
    be read as "Windows-1254".

    Letter  UTF-8 bytes (hex)  first byte  second byte
    ı       C4 B1              196         177
    ğ       C4 9F              196         159
    ş       C5 9F              197         159
    ü       C3 BC              195         188
    ö       C3 B6              195         182
    ç       C3 A7              195         167
    Ğ       C4 9E              196         158
    Ü       C3 9C              195         156
    Ş       C5 9E              197         158
    İ       C4 B0              196         176
    Ö       C3 96              195         150
    Ç       C3 87              195         135

    Args:
        inText: The raw text as a sequence of byte values (for example
            ``bytes`` or ``bytearray``).

    Returns:
        True as soon as one of the byte pairs above is found, otherwise
        False (also for input shorter than two bytes).

    By Dincer Kavraal <dkavraal gmail.com>
    And Licensed under Apache Software Foundation License 2.0

     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
    """
    # Lead byte -> second bytes that complete a Turkish letter in UTF-8.
    second_bytes_for = {
        195: (135, 150, 156, 167, 182, 188),  # Ç Ö Ü ç ö ü
        196: (158, 159, 176, 177),            # Ğ ğ İ ı
        197: (158, 159),                      # Ş ş
    }

    # Walk the input once, always remembering the byte seen just before.
    stream = iter(inText)
    lead = next(stream, None)
    for follower in stream:
        if follower in second_bytes_for.get(lead, ()):
            return True
        lead = follower
    return False

def detectCharset(src, default="UTF-8"):
    """
    Guess which charset a piece of Turkish text is encoded in.

    Args:
        src: The text to inspect. Raw ``bytes`` / ``bytearray`` are scanned
            with ``findAny_TR_UTF8Chars``; any other value is not inspected.
        default: Charset name returned when nothing can be detected.

    Returns:
        ``"UTF-8"`` when the raw bytes contain a UTF-8 encoded Turkish
        letter, ``"cp1254"`` when they do not (this includes pure ASCII
        data), and ``default`` for ``None``, empty input, ``str`` and every
        other type.
    """
    if not src:  # None, b"", "" and any other empty value
        return default

    if isinstance(src, (bytes, bytearray)):
        return "UTF-8" if findAny_TR_UTF8Chars(src) else "cp1254"

    # A str is already decoded and other types cannot be scanned, so fall
    # back to the caller's default instead of implicitly returning None.
    return default
	"""
	Letter Codes
	------------------------

	LETTER Windows-1254 ISO-8859-9 latin5 UTF-8
	ğ b'\xf0' b'\xf0' b'\xf0' b'\xc4\x9f'
	ı b'\xfd' b'\xfd' b'\xfd' b'\xc4\xb1'
	ş b'\xfe' b'\xfe' b'\xfe' b'\xc5\x9f'
	ü b'\xfc' b'\xfc' b'\xfc' b'\xc3\xbc'
	ö b'\xf6' b'\xf6' b'\xf6' b'\xc3\xb6'
	ç b'\xe7' b'\xe7' b'\xe7' b'\xc3\xa7'
	Ğ b'\xd0' b'\xd0' b'\xd0' b'\xc4\x9e'
	Ü b'\xdc' b'\xdc' b'\xdc' b'\xc3\x9c'
	Ş b'\xde' b'\xde' b'\xde' b'\xc5\x9e'
	İ b'\xdd' b'\xdd' b'\xdd' b'\xc4\xb0'
	Ö b'\xd6' b'\xd6' b'\xd6' b'\xc3\x96'
	Ç b'\xc7' b'\xc7' b'\xc7' b'\xc3\x87'


	How to distinguish
	------------------------

	##1

	for i in "ığşüöçĞÜŞİÖÇ":
	try:
	print(i, str(i.encode("utf-8").decode("cp1254")))
	except:
	print(i, "error")

	ı Ä±
	ğ ÄŸ
	ş ÅŸ
	ü Ã¼
	ö Ã¶
	ç Ã§
	Ğ error
	Ü Ãœ
	Ş error
	İ Ä°
	Ö Ã–
	Ç Ã‡

	##2

	b'\xc4'.decode("cp1254")

	'Ä'

	##3

	>>> for i in "ığşüöçĞÜŞİÖÇ":
	... print(i, str(i.encode("utf-8")), i.encode("utf-8")[0], i.encode("utf-8")[1], chr(i.encode("utf-8")[0]), chr(i.encode("utf-8")[1]), bytearray([ i.encode("utf-8")[0] ]).decode("cp1254"), bytearray([ i.encode("utf-8")[1] ]).decode("cp1254"))
	...
	ı b'\xc4\xb1' 196 177 Ä ± Ä ±
	ğ b'\xc4\x9f' 196 159 Ä Ä Ÿ
	ş b'\xc5\x9f' 197 159 Å Å Ÿ
	ü b'\xc3\xbc' 195 188 Ã ¼ Ã ¼
	ö b'\xc3\xb6' 195 182 Ã ¶ Ã ¶
	ç b'\xc3\xa7' 195 167 Ã § Ã §
	Ğ b'\xc4\x9e' 196 158 Ä Ä
	Ü b'\xc3\x9c' 195 156 Ã Ã œ
	Ş b'\xc5\x9e' 197 158 Å Å
	İ b'\xc4\xb0' 196 176 Ä ° Ä °
	Ö b'\xc3\x96' 195 150 Ã Ã –
	Ç b'\xc3\x87' 195 135 Ã Ã ‡

	actually this code:
	for i in "ığşüöçĞÜŞİÖÇ":
	try:
	print(i, "\t", end="")
	except:
	print("#\t", end="")
	try:
	print(str(i.encode("utf-8")), "\t", end="")
	except:
	print("#\t", end="")
	try:
	print(i.encode("utf-8")[0], "\t", end="")
	except:
	print("#\t", end="")
	try:
	print(i.encode("utf-8")[1], "\t", end="")
	except:
	print("#\t", end="")
	try:
	print(chr(i.encode("utf-8")[0]), "\t", end="")
	except:
	print("#\t", end="")
	try:
	print(chr(i.encode("utf-8")[1]), "\t", end="")
	except:
	print("#\t", end="")
	try:
	print(bytearray([ i.encode("utf-8")[0] ]).decode("cp1254"), "\t", end="")
	except:
	print("#\t", end="")
	try:
	print(bytearray([ i.encode("utf-8")[1] ]).decode("cp1254"), "\t", end="")
	except:
	print("#\t", end="")
	try:
	print(bytearray([ i.encode("utf-8")[0] ]).decode("ISO-8859-9"), "\t", end="")
	except:
	print("#\t", end="")
	try:
	print(bytearray([ i.encode("utf-8")[1] ]).decode("ISO-8859-9"), "\t", end="")
	except:
	print("#\t", end="")
	print()



	##4
	So,

	Real UTF8 ord(x) char(x) cp1254(x)
	Chr Encode #1 #2 #1 #2 #1 #2
	ı b'\xc4\xb1' 196 177 Ä ± Ä ±
	ğ b'\xc4\x9f' 196 159 Ä <THIS!> Ä Ÿ
	ş b'\xc5\x9f' 197 159 Å <THIS!> Å Ÿ
	ü b'\xc3\xbc' 195 188 Ã ¼ Ã ¼
	ö b'\xc3\xb6' 195 182 Ã ¶ Ã ¶
	ç b'\xc3\xa7' 195 167 Ã § Ã §
	Ğ b'\xc4\x9e' 196 158 Ä <THIS!> Ä <THIS!>
	Ü b'\xc3\x9c' 195 156 Ã <THIS!> Ã œ
	Ş b'\xc5\x9e' 197 158 Å <THIS!> Å <THIS!>
	İ b'\xc4\xb0' 196 176 Ä ° Ä °
	Ö b'\xc3\x96' 195 150 Ã <THIS!> Ã –
	Ç b'\xc3\x87' 195 135 Ã <THIS!> Ã ‡





	##5

	MAYBE(m), SURE (s)

	NOT cp1254 NOT ISO-8859-9 cp1254 ISO-8859-9 UTF8 NOT UTF8 NOT-ASCII
	\x9e s s s s
	\x9f s s m s
	\x9c
	\x96 s m m s
	\x87 s m m s


	##6
	if the text(as bytearray) contains b'\x9e':
	it contains "Ş" or "Ğ" ## nearly impossible it is "Ÿ"
	it is UTF-8
	so on so forth


	"""

	def findAny_TR_UTF8Chars(inText):
		"""
		Report whether a byte sequence holds a UTF-8 encoded Turkish letter.

		! PRECONDITION !: the text must be Turkish. Other languages (Icelandic
		etc.) may produce the same byte pairs; that has not been tested.

		Every pair of neighbouring bytes is compared with the two-byte UTF-8
		form of the Turkish-specific letters listed below. The first match
		returns True, which is meant to be read as "UTF-8"; False is meant to
		be read as "Windows-1254".

		Letter  UTF-8 bytes (hex)  first byte  second byte
		ı       C4 B1              196         177
		ğ       C4 9F              196         159
		ş       C5 9F              197         159
		ü       C3 BC              195         188
		ö       C3 B6              195         182
		ç       C3 A7              195         167
		Ğ       C4 9E              196         158
		Ü       C3 9C              195         156
		Ş       C5 9E              197         158
		İ       C4 B0              196         176
		Ö       C3 96              195         150
		Ç       C3 87              195         135

		Args:
			inText: The raw text as a sequence of byte values (for example
				``bytes`` or ``bytearray``).

		Returns:
			True as soon as one of the byte pairs above is found, otherwise
			False (also for input shorter than two bytes).

		By Dincer Kavraal <dkavraal gmail.com>
		And Licensed under Apache Software Foundation License 2.0

		* Licensed to the Apache Software Foundation (ASF) under one or more
		* contributor license agreements. See the NOTICE file distributed with
		* this work for additional information regarding copyright ownership.
		* The ASF licenses this file to You under the Apache License, Version 2.0
		* (the "License"); you may not use this file except in compliance with
		* the License. You may obtain a copy of the License at
		*
		* http://www.apache.org/licenses/LICENSE-2.0
		*
		* Unless required by applicable law or agreed to in writing, software
		* distributed under the License is distributed on an "AS IS" BASIS,
		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		* See the License for the specific language governing permissions and
		* limitations under the License.
		"""
		# Lead byte -> second bytes that complete a Turkish letter in UTF-8.
		second_bytes_for = {
			195: (135, 150, 156, 167, 182, 188),  # Ç Ö Ü ç ö ü
			196: (158, 159, 176, 177),  # Ğ ğ İ ı
			197: (158, 159),  # Ş ş
		}

		# Walk the input once, always remembering the byte seen just before.
		stream = iter(inText)
		lead = next(stream, None)
		for follower in stream:
			if follower in second_bytes_for.get(lead, ()):
				return True
			lead = follower
		return False

	def detectCharset(src, default="UTF-8"):
		"""
		Guess which charset a piece of Turkish text is encoded in.

		Args:
			src: The text to inspect. Raw ``bytes`` / ``bytearray`` are scanned
				with ``findAny_TR_UTF8Chars``; any other value is not inspected.
			default: Charset name returned when nothing can be detected.

		Returns:
			``"UTF-8"`` when the raw bytes contain a UTF-8 encoded Turkish
			letter, ``"cp1254"`` when they do not (this includes pure ASCII
			data), and ``default`` for ``None``, empty input, ``str`` and every
			other type.
		"""
		if not src:  # None, b"", "" and any other empty value
			return default

		if isinstance(src, (bytes, bytearray)):
			return "UTF-8" if findAny_TR_UTF8Chars(src) else "cp1254"

		# A str is already decoded and other types cannot be scanned, so fall
		# back to the caller's default instead of implicitly returning None.
		return default