@codingdudecom · Created November 1, 2023
NLP Python code: n-gram keyword extraction with NLTK in Pyodide. The snippet downloads NLTK's punkt tokenizer at runtime (it targets Pyodide, where the browser's fetch is exposed through the js module), filters out stop words, and returns the most frequent bi-, tri-, and quad-gram phrases as keywords.
from pathlib import Path
import zipfile

from js import fetch  # Pyodide: the browser's fetch exposed to Python
import nltk
from nltk.util import ngrams
stopwords = "i,me,my,myself,we,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,should,now"
stopwords = stopwords.split(",")
punkt_downloaded = False

async def download_punkt():
    """Fetch NLTK's punkt tokenizer over HTTP and unpack it into /nltk_data (once)."""
    global punkt_downloaded
    if punkt_downloaded:
        return
    response = await fetch('https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip')
    js_buffer = await response.arrayBuffer()
    py_buffer = js_buffer.to_py()  # memoryview over the JS ArrayBuffer
    stream = py_buffer.tobytes()   # copy into an immutable bytes object
    d = Path("/nltk_data/tokenizers")
    d.mkdir(parents=True, exist_ok=True)
    Path('/nltk_data/tokenizers/punkt.zip').write_bytes(stream)
    # Extract punkt.zip so nltk.word_tokenize() can find the model.
    zipfile.ZipFile('/nltk_data/tokenizers/punkt.zip').extractall(
        path='/nltk_data/tokenizers/'
    )
    punkt_downloaded = True
async def extract_keywords(text):
    """Return the most frequent bi-, tri-, and quad-gram phrases in `text`."""
    # Make sure the punkt tokenizer is available before tokenizing.
    await download_punkt()
    # Tokenize, keep alphanumeric tokens, and drop stop words.
    words = nltk.word_tokenize(text)
    words = [word for word in words if word.isalnum()]
    filtered_words = [word for word in words if word.lower() not in stopwords]
    # Create bi-grams, tri-grams, and quad-grams.
    bigrams = list(ngrams(filtered_words, 2))
    trigrams = list(ngrams(filtered_words, 3))
    quadgrams = list(ngrams(filtered_words, 4))
    # Calculate frequency distributions for each n-gram size.
    bigram_freq_dist = nltk.FreqDist(bigrams)
    trigram_freq_dist = nltk.FreqDist(trigrams)
    quadgram_freq_dist = nltk.FreqDist(quadgrams)
    # Take the top 10 of each size.
    data = (bigram_freq_dist.most_common(10)
            + trigram_freq_dist.most_common(10)
            + quadgram_freq_dist.most_common(10))
    # Join each n-gram tuple back into a phrase string, paired with its count.
    formatted_data = [[" ".join(keyword), count] for keyword, count in data]
    return formatted_data
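
For context, a minimal usage sketch follows. It assumes the code above runs inside Pyodide (which is why fetch comes from the js module), executed in a context where top-level await is available, such as pyodide.runPythonAsync; the sample text is invented for illustration.

# Hypothetical usage inside Pyodide (e.g. via pyodide.runPythonAsync,
# where top-level await works). The sample text below is made up.
sample_text = (
    "Keyword extraction with Python is simple. "
    "NLTK makes keyword extraction with Python simple and fast."
)
keywords = await extract_keywords(sample_text)
for phrase, count in keywords:
    print(phrase, count)  # prints each top phrase with its frequency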