Detecting Similar News

This gist is an extract from the article Detecting Similar News. It uses the data retrieved by a crawler to detect similar articles across different domains.

Usage

Start by running the crawler to retrieve the data. The crawler takes about 50 minutes to retrieve all the data the first time.

$ python run.py
retrieving url... [techcrunch.com] /
retrieving url... [techcrunch.com] /2014/08/02/...

Once the crawl is done, run the plagia script: it reads the database populated by the crawler, extracts the articles, and finds matches across the different domains.

$ python plagia.py
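For each article that has at least one match, the script prints the article URL followed by tab-indented matches: the number of shared rare words, the overlap percentage, and the matching article on the other domain. The output looks along these lines (the URLs here are illustrative, not real results):

http://techcrunch.com/2014/08/02/example-story/
	42	17%	http://www.engadget.com/2014/08/02/example-story-rehashed/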
#!/usr/bin/python
# filename: plagia.py

# native libs
import re
from collections import defaultdict

# external libs
from bs4 import BeautifulSoup

# local libs
from crawler import CrawlerCache

crawler_cache = CrawlerCache('crawler.db')

# Config per domain: how to recognize an article URL and where its body lives
sites = (
    {
        'domain': 'techcrunch.com',
        'url_re': re.compile('^/\d{4}/\d{2}/\d{2}/*').match,
        'get_content': lambda page: page.find('div', 'article-entry'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.engadget.com',
        'url_re': re.compile('^/\d{4}/\d{2}/\d{2}/*').match,
        'get_content': lambda page: page.find('div', 'post-body'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'gizmodo.com',
        'url_re': re.compile('^/[a-z0-9\-]*-\d{5,12}').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.zdnet.com',
        'url_re': re.compile('^/[a-z0-9\-]*-\d{5,12}/$').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.wired.com',
        'url_re': re.compile('^/\d{4}/\d{2}/[^/]*/$').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
)

# Store all the words for statistical value:
# filter the URLs, extract the content, collect the words per domain
for site in sites:
    domain = site['domain']
    # retrieve all the URLs for the current domain matching the article format
    urls = set(u for u in crawler_cache.get_urls(domain) if site['url_re'](u))
    for url in urls:
        html = crawler_cache.get(domain=domain, url=url)
        if html:
            # use BeautifulSoup to navigate the document
            page = BeautifulSoup(html)
            # retrieve the content of the article
            content = site['get_content'](page)
            if content:
                # remove script tag content from the article
                # (yes, there is JS inside the article :/)
                for s in content.find_all('script'):
                    s.clear()
                # strip the tags and keep the words
                article_words = content.get_text().split()
                # articles with fewer than 200 words kind of suck,
                # so let's ignore those
                if len(article_words) > 200:
                    # keep unique words by putting them in a set
                    article_words = set(w.lower() for w in article_words)
                    site['urls'][url] = article_words
                    # count the word occurrences per domain
                    for word in article_words:
                        site['words'][word] += 1

# Now let's remove words that are common across the domain's articles
for site in sites:
    # words present in over 5% of the domain's articles are removed
    threshold = len(site['urls']) * .05
    noisy_words = set(w for w, c in site['words'].items() if c > threshold)
    for url in site['urls'].keys():
        # remove them using the set difference feature, pretty sweet
        site['urls'][url] = site['urls'][url].difference(noisy_words)

# We can now compare articles to each other across domains
plagia = defaultdict(list)
for site in sites:
    for other_site in sites:
        # we don't match a site against itself :|
        if other_site['domain'] == site['domain']:
            continue
        # grab every article of the current domain
        for url in site['urls'].keys():
            # words of the current article
            words = site['urls'][url]
            # the minimum match has to be 10%
            best_score = len(words) * .1
            match = ''
            # compare the article to the other domain's articles
            for other_url in other_site['urls'].keys():
                # words of the article from the other domain
                other_words = other_site['urls'][other_url]
                # count how many "rare" words they share
                score = len(words.intersection(other_words))
                if score > best_score:
                    # woohoo, if you're here you're the new best match
                    match = other_url
                    best_score = score
            if match:
                full_url = 'http://%s%s' % (site['domain'], url)
                full_other_url = 'http://%s%s' % (other_site['domain'], match)
                plagia[full_url].append((
                    best_score,
                    (best_score * 100.0) / len(words),  # percentage
                    full_other_url,
                ))

for url, matches in plagia.items():
    print url
    for match in matches:
        print '\t%s\t%.d%%\t%s' % match
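The matching itself is plain set arithmetic: domain-common "noisy" words are stripped with a set difference, and two articles are scored by the size of the intersection of their remaining rare-word sets. Here is a minimal standalone sketch of that idea on made-up word sets (the sentences, noisy words, and script name are invented for illustration and are not part of the gist):

#!/usr/bin/python
# filename: toy_scoring.py -- illustration only
article_a = set("apple unveils a larger iphone screen at its event today".split())
article_b = set("apple announces larger screen for the new iphone today".split())

# pretend these words showed up in more than 5% of the domain's articles
noisy_words = set(["today", "the", "a", "at", "its", "for", "new"])

# same set difference plagia.py uses to drop domain-common words
rare_a = article_a.difference(noisy_words)
rare_b = article_b.difference(noisy_words)

# score: number of shared rare words; plagia.py only keeps a match
# when the score beats 10% of the source article's rare-word count
score = len(rare_a.intersection(rare_b))
print '%d shared rare words, %d%% of article A' % (score, score * 100.0 / len(rare_a))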
#!/usr/bin/python
# filename: run.py
import re

from crawler import Crawler, CrawlerCache

if __name__ == "__main__":
    # Use SQLite as a cache to avoid pulling the same pages twice;
    # the root pages match no_cache so each run picks up fresh links
    crawler = Crawler(CrawlerCache('crawler.db'))
    root_re = re.compile('^/$').match
    crawler.crawl('http://techcrunch.com/', no_cache=root_re)
    crawler.crawl('http://www.engadget.com/', no_cache=root_re)
    crawler.crawl('http://gizmodo.com/', no_cache=root_re)
    crawler.crawl('http://www.zdnet.com/', no_cache=root_re)
    crawler.crawl('http://www.wired.com/', no_cache=root_re)
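Covering an additional publication follows the same pattern in both files: crawl the domain in run.py and add an entry to the sites tuple in plagia.py describing where its articles live. The domain, URL pattern, and content selector below are made up and would have to be adapted to the real site's markup:

# in run.py, crawl the extra domain (hypothetical domain)
crawler.crawl('http://news.example.com/', no_cache=root_re)

# in plagia.py, add a matching entry to the `sites` tuple
{
    'domain': 'news.example.com',
    # assumed article URL shape: /2014/08/02/some-slug/
    'url_re': re.compile('^/\d{4}/\d{2}/\d{2}/[^/]*/$').match,
    # assumed CSS class wrapping the article body
    'get_content': lambda page: page.find('div', 'entry-content'),
    'words': defaultdict(int),
    'urls': {},
},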