Detecting Similar News

This gist is an extract from the article Detecting Similar News. It uses the data retrieved by a crawler to detect similar articles across different domains.

Usage

Start by running the crawler to retrieve the data. The crawler takes about 50 minutes to retrieve all the data the first time.

$ python run.py
retrieving url... [techcrunch.com] /
retrieving url... [techcrunch.com] /2014/08/02/...

Once the crawl is done, run the plagia script: it reads the database populated by the crawler, extracts the articles, and finds matches across the different domains.

$ python plagia.py
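For each article that has at least one match, the script prints the article URL followed by tab-indented matches: the number of shared rare words, the overlap percentage, and the matching article on the other domain. The output looks along these lines (the URLs here are illustrative, not real results):

http://techcrunch.com/2014/08/02/example-story/
	42	17%	http://www.engadget.com/2014/08/02/example-story-rehashed/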
#!/usr/bin/python
# filename: plagia.py

# native libs
import re
from collections import defaultdict

# external libs
from bs4 import BeautifulSoup

# local libs
from crawler import CrawlerCache

crawler_cache = CrawlerCache('crawler.db')

# Config per domain: how to recognize an article URL and where its body lives
sites = (
    {
        'domain': 'techcrunch.com',
        'url_re': re.compile('^/\d{4}/\d{2}/\d{2}/*').match,
        'get_content': lambda page: page.find('div', 'article-entry'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.engadget.com',
        'url_re': re.compile('^/\d{4}/\d{2}/\d{2}/*').match,
        'get_content': lambda page: page.find('div', 'post-body'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'gizmodo.com',
        'url_re': re.compile('^/[a-z0-9\-]*-\d{5,12}').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.zdnet.com',
        'url_re': re.compile('^/[a-z0-9\-]*-\d{5,12}/$').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.wired.com',
        'url_re': re.compile('^/\d{4}/\d{2}/[^/]*/$').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
)

# Store all the words for statistical value:
# filter the URLs, extract the content, collect the words per domain
for site in sites:
    domain = site['domain']
    # retrieve all the URLs for the current domain matching the article format
    urls = set(u for u in crawler_cache.get_urls(domain) if site['url_re'](u))
    for url in urls:
        html = crawler_cache.get(domain=domain, url=url)
        if html:
            # use BeautifulSoup to navigate the document
            page = BeautifulSoup(html)
            # retrieve the content of the article
            content = site['get_content'](page)
            if content:
                # remove script tag content from the article
                # (yes, there is JS inside the article :/)
                for s in content.find_all('script'):
                    s.clear()
                # strip the tags and keep the words
                article_words = content.get_text().split()
                # articles with fewer than 200 words kind of suck,
                # so let's ignore those
                if len(article_words) > 200:
                    # keep unique words by putting them in a set
                    article_words = set(w.lower() for w in article_words)
                    site['urls'][url] = article_words
                    # count the word occurrences per domain
                    for word in article_words:
                        site['words'][word] += 1

# Now let's remove words that are common across the domain's articles
for site in sites:
    # words present in over 5% of the domain's articles are removed
    threshold = len(site['urls']) * .05
    noisy_words = set(w for w, c in site['words'].items() if c > threshold)
    for url in site['urls'].keys():
        # remove them using the set difference feature, pretty sweet
        site['urls'][url] = site['urls'][url].difference(noisy_words)

# We can now compare articles to each other across domains
plagia = defaultdict(list)
for site in sites:
    for other_site in sites:
        # we don't match a site against itself :|
        if other_site['domain'] == site['domain']:
            continue
        # grab every article of the current domain
        for url in site['urls'].keys():
            # words of the current article
            words = site['urls'][url]
            # the minimum match has to be 10%
            best_score = len(words) * .1
            match = ''
            # compare the article to the other domain's articles
            for other_url in other_site['urls'].keys():
                # words of the article from the other domain
                other_words = other_site['urls'][other_url]
                # count how many "rare" words they share
                score = len(words.intersection(other_words))
                if score > best_score:
                    # woohoo, if you're here you're the new best match
                    match = other_url
                    best_score = score
            if match:
                full_url = 'http://%s%s' % (site['domain'], url)
                full_other_url = 'http://%s%s' % (other_site['domain'], match)
                plagia[full_url].append((
                    best_score,
                    (best_score * 100.0) / len(words),  # percentage
                    full_other_url,
                ))

for url, matches in plagia.items():
    print url
    for match in matches:
        print '\t%s\t%.d%%\t%s' % match
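The matching itself is plain set arithmetic: domain-common "noisy" words are stripped with a set difference, and two articles are scored by the size of the intersection of their remaining rare-word sets. Here is a minimal standalone sketch of that idea on made-up word sets (the sentences, noisy words, and script name are invented for illustration and are not part of the gist):

#!/usr/bin/python
# filename: toy_scoring.py -- illustration only
article_a = set("apple unveils a larger iphone screen at its event today".split())
article_b = set("apple announces larger screen for the new iphone today".split())

# pretend these words showed up in more than 5% of the domain's articles
noisy_words = set(["today", "the", "a", "at", "its", "for", "new"])

# same set difference plagia.py uses to drop domain-common words
rare_a = article_a.difference(noisy_words)
rare_b = article_b.difference(noisy_words)

# score: number of shared rare words; plagia.py only keeps a match
# when the score beats 10% of the source article's rare-word count
score = len(rare_a.intersection(rare_b))
print '%d shared rare words, %d%% of article A' % (score, score * 100.0 / len(rare_a))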
#!/usr/bin/python
# filename: run.py
import re

from crawler import Crawler, CrawlerCache

if __name__ == "__main__":
    # Use SQLite as a cache to avoid pulling the same pages twice;
    # the root pages match no_cache so each run picks up fresh links
    crawler = Crawler(CrawlerCache('crawler.db'))
    root_re = re.compile('^/$').match
    crawler.crawl('http://techcrunch.com/', no_cache=root_re)
    crawler.crawl('http://www.engadget.com/', no_cache=root_re)
    crawler.crawl('http://gizmodo.com/', no_cache=root_re)
    crawler.crawl('http://www.zdnet.com/', no_cache=root_re)
    crawler.crawl('http://www.wired.com/', no_cache=root_re)
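Covering an additional publication follows the same pattern in both files: crawl the domain in run.py and add an entry to the sites tuple in plagia.py describing where its articles live. The domain, URL pattern, and content selector below are made up and would have to be adapted to the real site's markup:

# in run.py, crawl the extra domain (hypothetical domain)
crawler.crawl('http://news.example.com/', no_cache=root_re)

# in plagia.py, add a matching entry to the `sites` tuple
{
    'domain': 'news.example.com',
    # assumed article URL shape: /2014/08/02/some-slug/
    'url_re': re.compile('^/\d{4}/\d{2}/\d{2}/[^/]*/$').match,
    # assumed CSS class wrapping the article body
    'get_content': lambda page: page.find('div', 'entry-content'),
    'words': defaultdict(int),
    'urls': {},
},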