from collections import Counter

#Characters used to build a distribution
alphabet = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p",
            "q","r","s","t","u","v","w","x","y","z",",",";","-"]

#Languages supported
languages = ["english","italian","french","german"]

#Maps a language name to its reference list of relative character
#frequencies (filled in by loadDistribDict)
distribDict = dict()


def frequencies(string,letters):
    """Count how often each entry of *letters* occurs in *string*.

    Returns a flat list that alternates character and count, in the
    order of *letters*: [letter_0, count_0, letter_1, count_1, ...].
    The comparison is case-sensitive and elements of *string* that are
    not listed in *letters* are ignored.
    """
    #A single pass over the text instead of one pass per letter
    counts = Counter(string)
    list_frequencies = []
    for letter in letters:
        list_frequencies.append(letter)
        list_frequencies.append(counts[letter])
    return list_frequencies


def fix_lists_letter(list_1):
    """Split the flat list produced by frequencies() into two lists.

    Returns [letters, counts], where letters are the elements found at
    even positions of *list_1* and counts the elements at odd positions.
    An empty input gives [[], []]. If the two halves differ in length
    (odd number of elements) the string "Some error occurred" is
    returned instead of a list.
    """
    list_letters = list(list_1[0::2])
    list_freq = list(list_1[1::2])
    if len(list_letters) != len(list_freq):
        return "Some error occurred"
    return [list_letters,list_freq]


def get_rel_freq(list_1):
    """Return each value of *list_1* divided by the sum of *list_1*.

    When the sum is zero (for example a text that contains none of the
    counted characters) every relative frequency is reported as 0.0
    rather than dividing by zero.
    """
    #The total is loop-invariant, so it is computed only once
    total = sum(list_1)
    if total == 0:
        return [0.0 for _ in list_1]
    return [value/total for value in list_1]


def returnDistribution(strings,alphaBet):
    """Return the character distribution of the text *strings*.

    The result is [letters, relative_frequencies], with both lists in
    the order of *alphaBet*. The number of counted characters is also
    printed.
    """
    letters, counts = fix_lists_letter(frequencies(strings,alphaBet))
    distribution = [letters,get_rel_freq(counts)]
    #Note: only characters listed in alphaBet are counted, so
    #spaces " " are NOT considered as characters
    nChar = sum(counts)
    print("Number of character used:", nChar, sep=" ")
    return distribution


def loadDistribDict(directory="C:\\Users\\desktop\\lproject\\"):
    """Load the reference distribution of every language into distribDict.

    For each name in languages the file directory + name + "Dist.txt"
    is read; its second line must hold the relative frequencies
    separated by single spaces (empty tokens are read as 0).

    *directory* is prepended verbatim to the file name, so it has to
    end with a path separator. A file that is missing or malformed is
    reported with print() and skipped; the remaining languages are
    still loaded.
    """
    for lang in languages:
        try:
            #The with-statement closes the file even if reading fails
            with open(directory+lang+"Dist.txt","r") as fileToRead:
                data = fileToRead.read()
            dist = data.split("\n")[1].split(" ")
            distList = [float(number) if number != '' else 0.0
                        for number in dist]
        except (OSError, IndexError, ValueError) as e:
            print(e)
            continue
        distribDict[lang] = distList
        print("Loaded",lang,"character frequency distribution!",sep=" ")


#String to test
stringToCheck = "Hallo diese ist eine schoene Satze auf deutsch"

#Frequent words of each language; every word is padded with spaces so
#that it is only matched as a whole word inside a sentence
commonEnglishWords = [" is "," the "," of "," and "," to "," that "," for ",
                      " it "," as "," with "," be "," by "," this "," are ",
                      " or "," his "," from "," at "," which "," but ",
                      " they "," you "," we "," she "," there "," have ",
                      " had "," has "," yes "]
commonGermanWords = [" ein "," das "," ist "," der "," ich "," nicht ",
                     " es "," und "," Sie "," wir "," zu "," er "," sie ",
                     " mir "," ja "," wie "," den "," auf "," mich ",
                     " dass "," hier "," wenn "," sind "," eine "," von ",
                     " dich "," dir "," noch "," bin "," uns "," kann ",
                     " dem "]
commonItalianWords = [" di "," che "," il "," per "," gli "," una ",
                      " sono "," ho "," lo "," ha "," le "," ti "," con ",
                      " cosa "," come "," ci "," questo "," hai "," sei ",
                      " del "," bene "," era "," mio "," solo "," gli ",
                      " tutto "," della "," mia "," fatto "]
commonFrenchWords = [" avoir "," est "," je "," pas "," et "," aller ",
                     " les "," en "," faire "," tout "," que "," pour ",
                     " une "," mes "," vouloir "," pouvoir "," nous ",
                     " dans "," savoir "," bien "," mon "," au "," avec ",
                     " moi "," quoi "," devoir "," oui "," comme "," ils "]
commonWordsDict = {"english":commonEnglishWords,
                   "german":commonGermanWords,
                   "italian":commonItalianWords,
                   "french":commonFrenchWords}


def checkLang(string):
    """Guess the language of *string* and return its name.

    The character distribution of the text is compared with every
    reference distribution stored in distribDict; the language with
    the smallest sum of absolute differences is selected. For texts
    shorter than 420 characters the first language of commonWordsDict
    that has one of its common words in the text overrides that choice.

    Languages without a loaded distribution are skipped. None is
    returned when no distribution is loaded and no common word matches.
    """
    distToCheckFreq = returnDistribution(string,alphabet)[1]
    diffDict = dict()
    #For each language we calculate the difference between the
    #observed distribution and the given one, over ALL the characters
    #of the alphabet. zip stops at the shorter list, so a trailing
    #extra value in a stored distribution is ignored.
    for lang in languages:
        if lang not in distribDict:
            continue
        diffDict[lang] = sum(abs(observed-expected)
                             for observed, expected
                             in zip(distToCheckFreq,distribDict[lang]))
    #Check: show the distance computed for each language
    for lang, diff in diffDict.items():
        print(lang,diff)
    langFound = min(diffDict, key=diffDict.get) if diffDict else None
    #If the sample sentence is shorter than 420 characters then
    #we may have some recognition issues which will be dealt
    #here below..
    langChecked = ""
    correct = False
    if len(string) < 420:
        for langKey, words in commonWordsDict.items():
            if any(word in string for word in words):
                langChecked = langKey
                correct = True
                break
    if correct:
        print("Lang found: ",langFound)
        print("Lang checked: ",langChecked)
        langFound = langChecked
    #The language found is returned here
    print("\n")
    return langFound


loadDistribDict()
print("\n")
print("Language found by the program: ",checkLang(stringToCheck))