Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
قطعه کدی برای بررسی و آنالیز تکرار واژگان در ضرب‌المثل‌های فارسی
import urllib.request
import arabic_reshaper
from bs4 import BeautifulSoup
from collections import Counter
from bidi.algorithm import get_display
from persian_wordcloud.wordcloud import PersianWordCloud
def get():
    """Scrape every page listed in the ``links`` file and append its text to ``db``.

    ``links`` is read as one URL per line. Each page is fetched and parsed
    with the ``lxml`` parser. For every ``<p>`` element, the text that sits
    after the second ``>`` of its markup and before the next ``<`` is taken,
    the site caption is removed, and the result is printed and appended to
    ``db`` as one line.
    """
    with open('links', encoding='utf-8') as links_file:
        # Blank lines (e.g. a trailing newline at the end of the file) would
        # otherwise reach urlopen() as an empty URL and raise ValueError.
        links = [line.strip() for line in links_file if line.strip()]

    # Open the output once and with an explicit encoding: the scraped text is
    # Persian, and the platform default encoding cannot always represent it.
    with open('db', 'a', encoding='utf-8') as db:
        for link in links:
            print(link)
            # The response is a context manager; closing it releases the
            # underlying socket as soon as the page has been parsed.
            with urllib.request.urlopen(link) as html:
                soup = BeautifulSoup(html, 'lxml')
            for paragraph in soup.find_all('p'):
                markup = str(paragraph)
                # Third '>'-separated piece, cut at the next tag opening.
                inner = markup.split('>')[2].split('<')[0]
                text = inner.replace('ضرب المثل هاي ايراني', '')
                db.write(text + '\n')
                print(text)
def convert(text):
    """Return *text* reshaped by ``arabic_reshaper`` and then passed through ``get_display``."""
    return get_display(arabic_reshaper.reshape(text))
def wc():
    """Build a word cloud from the words in ``db`` and save it as ``result.png``.

    Words listed in ``stopwords.txt`` (whitespace-separated) are left out.
    Every remaining word is passed through ``convert`` before being handed to
    ``PersianWordCloud``. The rendered image is shown and then written to
    ``result.png``.
    """
    with open('stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = {word for line in f for word in line.split()}

    words = []
    with open('db', 'r', encoding='utf-8') as f:
        for line in f:
            for token in line.split():
                # Strip the Persian comma BEFORE the stop-word test, so a
                # stop word that happens to be followed by a comma is still
                # filtered out; also drop tokens that were only a comma.
                word = token.replace('،', '')
                if word and word not in stopwords:
                    words.append(convert(word))

    text = '\n'.join(words)
    wordcloud = PersianWordCloud(
        only_persian=True,
        max_words=150,
        margin=5,
        width=800,
        height=800,
        min_font_size=1,
        colormap='Accent',
        max_font_size=500,
        background_color="white"
    ).generate(text)
    image = wordcloud.to_image()
    image.show()
    image.save('result.png')
def count():
    """Print the frequency of every non-stop-word in ``db``, least frequent first.

    Stop words are read from ``stopwords.txt`` (whitespace-separated). The
    Persian comma is removed from each token before counting, so a word is
    counted the same with or without a trailing comma. Each result is printed
    as ``word :: count``; words with equal counts keep the order in which
    they first appear in ``db``.
    """
    with open('stopwords.txt', 'r', encoding='utf-8') as f:
        # A set gives O(1) membership tests for the filter below.
        stopwords = {word for line in f for word in line.split()}

    with open('db', 'r', encoding='utf-8') as f:
        tokens = f.read().split()

    normalised = (token.replace('،', '') for token in tokens)
    frequencies = Counter(
        word for word in normalised if word and word not in stopwords
    )

    # sorted() is stable, so ties stay in first-seen order.
    for word, occurrences in sorted(frequencies.items(), key=lambda item: item[1]):
        print('{} :: {}'.format(word, occurrences))
def main():
    """Run the pipeline stages in order: ``get``, then ``count``, then ``wc``."""
    for stage in (get, count, wc):
        stage()
# Run the pipeline only when this file is executed directly as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment