This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import nltk | |
nltk.download('stopwords') | |
# download stopwords list from nltk | |
from nltk.corpus import stopwords | |
stop_words = set(stopwords.words('english')) | |
def clean_text(text): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
re.findall(r"(\d{4})-(\d{2})-(\d{2})", date) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
html = """<table class="vertical-navbox nowraplinks" style="float:right;clear:right;width:22.0em;margin:0 0 1.0em 1.0em;background:#f9f9f9;border:1px solid #aaa;padding:0.2em;border-spacing:0.4em 0;text-align:center;line-height:1.4em;font-size:88%"><tbody><tr><th style="padding:0.2em 0.4em 0.2em;font-size:145%;line-height:1.2em"><a href="/wiki/Machine_learning" title="Machine learning">Machine learning</a> and<br /><a href="/wiki/Data_mining" title="Data mining">data mining</a></th></tr><tr><td style="padding:0.2em 0 0.4em;padding:0.25em 0.25em 0.75em;"><a href="/wiki/File:Kernel_Machine.svg" class="image"><img alt="Kernel Machine.svg" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fe/Kernel_Machine.svg/220px-Kernel_Machine.svg.png" decoding="async" width="220" height="100" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/fe/Kernel_Machine.svg/330px-Kernel_Machine.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/fe/Kernel_Machine.svg/440px-Kernel_Machine.svg.png 2x" data-file-widt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# give your filename here | |
with open("filename.txt", "r") as fp: | |
text = fp.read() | |
re.findall(r"[\w.-]+@[\w.-]+", text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# insert your text here | |
text = "" | |
re.findall(r"[\w.-]+@[\w.-]+", text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from indicnlp.script import indic_scripts as isc | |
from indicnlp.script import phonetic_sim as psim | |
c1='क' | |
c2='ख' | |
c3='भ' | |
lang='hi' | |
print('Similarity between {} and {}'.format(c1,c2)) | |
print(psim.cosine( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from indicnlp.langinfo import * | |
# Input character | |
c='आ' | |
# Language is Hindi, identified by the language code 'hi' | |
lang='hi' | |
print('Is vowel?: {}'.format(is_vowel(c,lang))) | |
print('Is consonant?: {}'.format(is_consonant(c,lang))) | |
print('Is velar?: {}'.format(is_velar(c,lang))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator | |
# Input text "Today the weather is good. Sun is bright and there are no signs of rain. Hence we can play today." | |
input_text='आज मौसम अच्छा है। सूरज उज्ज्वल है और बारिश के कोई संकेत नहीं हैं। इसलिए हम आज खेल सकते हैं!' | |
# Transliterate from Hindi to Telugu | |
print(UnicodeIndicTransliterator.transliterate(input_text,"hi","te")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from indicnlp.tokenize import sentence_tokenize | |
indic_string="""तो क्या विश्व कप 2019 में मैच का बॉस टॉस है? यानी मैच में हार-जीत में \ | |
टॉस की भूमिका अहम है? आप ऐसा सोच सकते हैं। विश्वकप के अपने-अपने पहले मैच में बुरी तरह हारने वाली एशिया की दो टीमों \ | |
पाकिस्तान और श्रीलंका के कप्तान ने हालांकि अपने हार के पीछे टॉस की दलील तो नहीं दी, लेकिन यह जरूर कहा था कि वह एक अहम टॉस हार गए थे।""" | |
# Split the text into sentences; the language code "hi" is passed for Hindi | |
sentences=sentence_tokenize.sentence_split(indic_string, lang='hi') | |
# print the sentences |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from indicnlp import common | |
# The path to the local git repo for Indic NLP library | |
INDIC_NLP_LIB_HOME=r"indic_nlp_library" | |
# The path to the local git repo for Indic NLP Resources | |
INDIC_NLP_RESOURCES=r"indic_nlp_resources" | |
# Add library to Python path |