"""Count how often country names and country codes occur in the raw e-mail text.

The script reads ``../input/Emails.csv``, splits every ``RawText`` body into
lower-cased words, builds a list of country names and ISO codes with
``pycountry`` and stores the 100 most frequent matches in
``high_frequency_countries`` as ``(word, count)`` pairs.
"""
import csv
import re
from collections import Counter

import enchant
import pandas as pd
import pycountry

# Compiled once so the pattern is not looked up again for every e-mail.
WORD_PATTERN = re.compile(r'\w+')

# Reading all the text in the raw e-mail bodies and splitting it into words.
word_list = []
email_dataframe = pd.read_csv("../input/Emails.csv")
raw_text_list = email_dataframe['RawText'].tolist()
for raw_text in raw_text_list:
    # pandas represents a missing body as a float NaN, which has no .lower().
    if not isinstance(raw_text, str):
        continue
    # extend() grows the list in place; `word_list = word_list + ...` would
    # copy the whole list on every iteration.
    word_list.extend(WORD_PATTERN.findall(raw_text.lower()))

# Using the pycountry module to list all countries. The shortened forms of
# each country are considered as well.
# Example: all 3 forms of Pakistan are considered (pakistan, pk, pak).
# But if a shortened form is also an English word, such as 'us', it is left
# out of the search.
english_checker = enchant.Dict("en_US")
country_list = []
for country in pycountry.countries:
    country_list.append(country.name.strip().lower())
    # Newer pycountry releases name the code attributes alpha_2 / alpha_3,
    # older ones alpha2 / alpha3. Try the new name first and fall back to the
    # old one; if neither exists the AttributeError is allowed to propagate.
    for new_name, old_name in (("alpha_2", "alpha2"), ("alpha_3", "alpha3")):
        code = getattr(country, new_name, None) or getattr(country, old_name)
        code = code.lower()
        if not english_checker.check(code):
            country_list.append(code)

# Set membership is O(1); scanning country_list for every word would be O(n).
country_lookup = frozenset(country_list)
high_frequency_countries = Counter(
    word for word in word_list if word in country_lookup
).most_common(100)