Last active
April 30, 2024 05:35
-
-
Save sohang3112/68c801d7afe7b49e7762641b41aad520 to your computer and use it in GitHub Desktop.
Notes on nltk - Natural Language text processing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

# Optional: set a custom directory for NLTK data (used both for downloading
# and for loading). This must happen BEFORE nltk is imported.
os.environ["NLTK_DATA"] = "/opt/nltk_data"

import os.path

import nltk
from nltk.corpus import stopwords

# Stopwords (e.g. "for", "the", "a" in English) are very common words that can
# usually be removed without changing the meaning of the data.
nltk.download('stopwords')
print(stopwords.words('english'))
print(stopwords.words('arabic'))
print('List of all supported languages:', stopwords.fileids())
# Interestingly, Hindi is not a supported language, but Hinglish is!!

# Stopwords at custom location
# NOTE: Instead of this, the simpler method is to set the NLTK_DATA
# environment variable BEFORE importing nltk (see top of file).
custom_nltk_download_path = "/custom/path/to/nltk/"
nltk.download(
    "stopwords",
    # The corpus is unpacked into corpora/stopwords/ inside this directory.
    download_dir=custom_nltk_download_path,
)
# Read the word lists straight from the downloaded directory.
# LazyCorpusLoader is not used here because it expects a corpus *name* that it
# looks up under the nltk.data.path entries, not a filesystem path.
# The regex selects every file except README and hidden (dot) files.
stopwords = nltk.corpus.WordListCorpusReader(
    os.path.join(custom_nltk_download_path, "corpora", "stopwords"),
    r"(?!README|\.).*",
    encoding="utf8",
)

# --------------------
# "punkt" is the data package for NLTK's Punkt sentence tokenizer (the name
# does not stand for "punctuation").
# NOTE(review): recent NLTK releases may require 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment