# sohang3112/nltk_notes.py

## nltk_notes.py
import os

# NLTK_DATA has to be set BEFORE nltk is imported (see the NOTE further down),
# which is why this assignment sits between the imports.
# Optional - set custom path for nltk data (to download & use)
os.environ["NLTK_DATA"] = "/opt/nltk_data"

import os.path
import nltk
from nltk.corpus import stopwords

# identify stopwords (eg. for, the, a, etc. in English) - words that can be removed without changing the meaning of the data
nltk.download('stopwords')
print(stopwords.words('english'))
print(stopwords.words('arabic'))
print('List of all supported languages:', stopwords.fileids())
# Interestingly, Hindi is not a supported language, but Hinglish is!!

# Stopwords at custom location
# NOTE: Instead of this, simpler method is to set NLTK_DATA environment variable BEFORE importing nltk
custom_nltk_download_path = "/custom/path/to/nltk/"
nltk.download(
    "stopwords",
    # stopwords/ folder will be downloaded inside this, in corpora/stopwords/
    download_dir=custom_nltk_download_path,
)

# LazyCorpusLoader takes a corpus *name*, which it looks up as corpora/<name>
# in the directories listed in nltk.data.path - it does not take a filesystem
# path. So register the custom directory as a data directory (first, so it is
# preferred over any other copy of the corpus) and load the corpus by name.
if custom_nltk_download_path not in nltk.data.path:
    nltk.data.path.insert(0, custom_nltk_download_path)

# Rebinds `stopwords` on purpose: from here on it reads from the custom location.
# The pattern selects every file in the corpus folder except README and dotfiles.
stopwords = nltk.corpus.LazyCorpusLoader(
    "stopwords",
    nltk.corpus.WordListCorpusReader,
    r"(?!README|\.).*",
    encoding="utf8",
)

# --------------------
# Punkt is NLTK's sentence tokenizer (its pre-trained models), not "punctuation".
nltk.download('punkt')

# The statements below repeat the stopwords walkthrough. They are kept at
# column 0: indented top-level statements raise "IndentationError: unexpected indent".
import os

# NLTK_DATA has to be set BEFORE nltk is imported (see the NOTE further down).
# Optional - set custom path for nltk data (to download & use)
os.environ["NLTK_DATA"] = "/opt/nltk_data"

import os.path
import nltk
from nltk.corpus import stopwords

# identify stopwords (eg. for, the, a, etc. in English) - words that can be removed without changing the meaning of the data
nltk.download('stopwords')
print(stopwords.words('english'))
print(stopwords.words('arabic'))
print('List of all supported languages:', stopwords.fileids())
# Interestingly, Hindi is not a supported language, but Hinglish is!!

# Stopwords at custom location
# NOTE: Instead of this, simpler method is to set NLTK_DATA environment variable BEFORE importing nltk
custom_nltk_download_path = "/custom/path/to/nltk/"
nltk.download(
    "stopwords",
    # stopwords/ folder will be downloaded inside this, in corpora/stopwords/
    download_dir=custom_nltk_download_path,
)

# LazyCorpusLoader takes a corpus *name*, which it looks up as corpora/<name>
# in the directories listed in nltk.data.path - it does not take a filesystem
# path. So register the custom directory as a data directory (first, so it is
# preferred over any other copy of the corpus) and load the corpus by name.
if custom_nltk_download_path not in nltk.data.path:
    nltk.data.path.insert(0, custom_nltk_download_path)

# Rebinds `stopwords` on purpose: from here on it reads from the custom location.
# The pattern selects every file in the corpus folder except README and dotfiles;
# the "|" must stay unescaped so it acts as alternation, not a literal pipe.
stopwords = nltk.corpus.LazyCorpusLoader(
    "stopwords",
    nltk.corpus.WordListCorpusReader,
    r"(?!README|\.).*",
    encoding="utf8",
)

# --------------------
# Punkt is NLTK's sentence tokenizer (its pre-trained models), not "punctuation".
nltk.download('punkt')