Skip to content

Instantly share code, notes, and snippets.

@Bishwas-py
Created March 3, 2022 16:39
Show Gist options
  • Save Bishwas-py/067055ce8c0d770956a554bf9d63ce50 to your computer and use it in GitHub Desktop.
Save Bishwas-py/067055ce8c0d770956a554bf9d63ce50 to your computer and use it in GitHub Desktop.
webscraping and multiprocessing with py
def get_set_data(link, mp_data):
    """Scrape one article and record its statistics in the shared ``mp_data``.

    On success, a dict describing the article is appended to
    ``mp_data['articles']`` and ``mp_data['site_info']['pages_num']`` is
    incremented. Articles judged high quality (at least 250 words and a
    word/paragraph ratio of at least 6) also increment
    ``mp_data['site_info']['hq_pages_num']``. Articles that cannot be
    fetched are logged and skipped; nothing is recorded for them.

    Args:
        link: URL of the article to fetch.
        mp_data: Mapping holding ``'articles'`` (an object supporting
            ``append``) and ``'site_info'`` (a mapping whose ``'pages_num'``
            and ``'hq_pages_num'`` entries expose a ``.value`` counter).
            Presumably multiprocessing shared objects -- confirm with caller.
    """
    import logging

    try:
        # NOTE(review): assumes `Article` / `ArticleException` come from
        # newspaper3k, where summary and keywords are only populated after
        # download() -> parse() -> nlp() -- confirm against the file's imports.
        article = Article(link)
        article.download()
        article.parse()
        article.nlp()
    except ArticleException as exc:
        # Best effort: one unreachable page must not abort the whole crawl.
        logging.getLogger(__name__).warning("Skipping %s: %s", link, exc)
        return

    text = article.text
    # split() without arguments ignores runs of whitespace, so empty text
    # counts as 0 words rather than 1.
    article_words = text.split()
    # NOTE(review): blank separator lines are counted as paragraphs here,
    # exactly as before; the ratio threshold below may rely on that.
    article_paragraphs = text.splitlines()

    if article_paragraphs:
        word_paragraph_ratio = len(article_words) / len(article_paragraphs)
    else:
        word_paragraph_ratio = 0

    # Article High Quality Detection
    is_hq = len(article_words) >= 250 and word_paragraph_ratio >= 6

    article_data = {
        'url': article.url,
        'summary': article.summary,
        'content': text,
        'keywords': article.keywords,
        'tags': article.tags,
        'word_paragraph_ratio': word_paragraph_ratio,
        'paragraph_count': len(article_paragraphs),
        'word_count': len(article_words),
        # Always present, so consumers never hit a missing 'hq' key.
        'hq': is_hq,
    }

    site_info = mp_data['site_info']
    # NOTE(review): `.value += 1` is a read-modify-write; if several worker
    # processes share these counters, confirm the caller serializes access.
    if is_hq:
        site_info['hq_pages_num'].value += 1
    mp_data['articles'].append(article_data)
    site_info['pages_num'].value += 1
@kbshal
Copy link

kbshal commented Mar 3, 2022

# article_data is a single dict, so wrap it in a list before concatenating;
# list + dict raises TypeError.
mp_data['articles'] = mp_data['articles'] + [article_data]

@kbshal
Copy link

kbshal commented Mar 3, 2022

from multiprocessing import Process, Manager

def f(d):
    """Extend the list stored under key ``1`` of ``d`` and print ``d``.

    Args:
        d: Mapping whose key ``1`` holds a list. The caller in this snippet
            passes a ``Manager().dict()`` proxy.
    """
    # Reassign d[1] instead of calling d[1].append(...): a manager dict proxy
    # only propagates changes made through the proxy itself, so mutating the
    # plain list it returns would be lost.
    for item in (4, 5, 'fjasfkjs'):
        d[1] = d[1] + [item]

    print(d)

# The dunder underscores are required: a bare `name == 'main'` raises
# NameError, and the guard keeps child processes from re-running this block
# when the module is imported under the spawn start method.
if __name__ == '__main__':
    # Create only one manager; the context manager shuts it down on exit.
    with Manager() as manager:
        d = manager.dict()  # create only 1 dict
        d[1] = []
        p = Process(target=f, args=(d,))  # tell 'f' which 'd' it should extend
        p.start()
        p.join()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment