Skip to content

Instantly share code, notes, and snippets.

@Bishwas-py
Created March 3, 2022 16:39
Show Gist options
  • Save Bishwas-py/067055ce8c0d770956a554bf9d63ce50 to your computer and use it in GitHub Desktop.
Save Bishwas-py/067055ce8c0d770956a554bf9d63ce50 to your computer and use it in GitHub Desktop.
Web scraping and multiprocessing with Python
def get_set_data(link, mp_data):
    """Gather statistics for one article and record them in ``mp_data``.

    Args:
        link: Location of the article; passed to ``Article``.
        mp_data: Shared mapping. ``mp_data['articles']`` must support
            ``append`` and ``mp_data['site_info']`` must hold the counters
            ``pages_num`` and ``hq_pages_num``, each exposing ``.value``.

    Returns:
        None. On success one dict is appended to ``mp_data['articles']`` and
        ``pages_num`` is incremented (plus ``hq_pages_num`` when the article
        is rated high quality). If ``ArticleException`` is raised while the
        article is read, the link is skipped and ``mp_data`` is not modified.
    """
    try:
        # The original bound the result to `articles` but read `article`
        # (NameError) and never used `link`.
        # NOTE(review): if Article is newspaper's Article, download() / parse()
        # / nlp() presumably have to run before these attributes are
        # populated -- confirm against the library in use.
        article = Article(link)  # gets article information
        article_data = {
            'url': article.url,
            'summary': article.summary,
            'content': article.text,
            'keywords': article.keywords,
            'tags': article.tags,
        }
    except ArticleException:
        # Best-effort scraping: a link that cannot be read is simply skipped.
        return

    text = article_data['content']
    article_words = text.split(' ')
    article_paragraphs = text.splitlines()
    if article_paragraphs:
        word_paragraph_ratio = len(article_words) / len(article_paragraphs)
    else:
        # No lines at all: avoid dividing by zero.
        word_paragraph_ratio = 0

    article_data['word_paragraph_ratio'] = word_paragraph_ratio
    article_data['paragraph_count'] = len(article_paragraphs)
    article_data['word_count'] = len(article_words)

    # Article High Quality Detection.
    # 'hq' is now always present; previously it was missing for articles
    # shorter than 250 words, so consumers could hit a KeyError.
    is_hq = len(article_words) >= 250 and word_paragraph_ratio >= 6
    article_data['hq'] = is_hq

    site_info = mp_data['site_info']
    # NOTE(review): `.value += 1` is a separate read and write; when several
    # processes update the same counter concurrently, increments may be lost
    # unless callers serialise access (e.g. with a shared lock) -- confirm.
    if is_hq:
        site_info['hq_pages_num'].value += 1
    mp_data['articles'].append(article_data)  # appender...
    site_info['pages_num'].value += 1
@Bishwas-py
Copy link
Author

Here's mp_data:


    # Manager-backed state shared between the parent and the worker processes:
    # two integer counters starting at 0 and a list collecting article dicts.
    mp_data = manager.dict()
    mp_data.update({
        'site_info': {
            counter: manager.Value('i', 0)
            for counter in ('pages_num', 'hq_pages_num')
        },
        'articles': manager.list(),
    })

@Bishwas-py
Copy link
Author

Bishwas-py commented Mar 3, 2022

More...

    # Shared state handed to every worker: per-site counters (integers that
    # start at 0) and the list the workers append their article dicts to.
    mp_data = manager.dict()
    mp_data['site_info'] = {
        'pages_num': manager.Value('i', 0),
        'hq_pages_num': manager.Value('i', 0),
    }
    mp_data['articles'] = manager.list()

    # Start one worker process per distinct link.
    processes = []
    added_articles = []  # distinct links, in first-seen order
    # Set used only for the duplicate check, so each test is O(1) instead of
    # scanning `added_articles`; assumes links are hashable (e.g. URL strings).
    seen_links = set()
    for link in links:
        if link in seen_links:
            continue
        seen_links.add(link)
        process = Process(target=get_set_data, args=(link, mp_data))
        processes.append(process)
        process.start()
        added_articles.append(link)

    # Wait for every worker to finish before reading the shared results.
    for process in processes:
        print("Before Joining:  ", process.is_alive())
        process.join()
        print("After Joining:  ", process.is_alive())

    # Copy the results out of the shared objects into plain Python values.
    data = {
        'site_info': {
            'pages_num': mp_data['site_info']['pages_num'].value,
            'hq_pages_num': mp_data['site_info']['hq_pages_num'].value
        },
        'articles': list(mp_data['articles'])
    }

@kbshal
Copy link

kbshal commented Mar 3, 2022

# Wrap the dict in a list: `list + dict` raises TypeError.
# NOTE(review): this read-then-reassign is not atomic across processes -- confirm it is safe for the caller.
mp_data['articles'] = mp_data['articles'] + [article_data]

@kbshal
Copy link

kbshal commented Mar 3, 2022

from multiprocessing import Process, Manager

def f(d):
    """Extend the list stored under key ``1`` of ``d`` and print ``d``.

    Each step builds a new list and assigns it back to ``d[1]`` rather than
    mutating the stored list in place. With a manager dict, an in-place change
    to a plain list stored inside it is not propagated to other processes,
    whereas re-assigning the key is.

    Args:
        d: Mapping (for example a ``Manager().dict()``) whose key ``1`` holds
            a list.
    """
    d[1] = d[1] + [4]
    d[1] = d[1] + [5]
    d[1] = d[1] + ['fjasfkjs']

    print(d)

# Entry-point guard: the dunder names are required (`name == 'main'` would
# raise NameError), and the guard keeps child processes that re-import this
# module from starting workers themselves.
if __name__ == '__main__':
    manager = Manager()  # create only 1 mgr
    d = manager.dict()  # create only 1 dict
    d[1] = []
    p = Process(target=f, args=(d,))  # say to 'f', in which 'd' it should append
    p.start()
    p.join()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment