@yokawasa
Last active November 18, 2021 22:57
RSS Crawler - Crawling and parsing feeds with feedparser and storing the results in Azure DocumentDB with pydocumentdb
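The script below needs the feedparser and pydocumentdb packages (e.g. `pip install feedparser pydocumentdb`) and the four placeholder settings at the top filled in with your own DocumentDB account values. For each feed entry that is not already stored, it creates a JSON document of roughly this shape (the values here are illustrative, not from a real feed):

    {
        "title": "Announcing a new Azure feature",
        "content": "<p>Entry body as HTML ...</p>",
        "permalink": "http://blogs.msdn.com/b/windowsazurej/archive/example.aspx",
        "postdate": "Mon, 01 Jun 2015 09:00:00 GMT"
    }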
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import feedparser
import pydocumentdb.document_client as document_client

Docdb_masterKey = '<Your DocumentDB master key string>'
Docdb_host = 'https://<documentdb account>.documents.azure.com:443/'
Docdb_dbname = '<documentdb database name>'
Docdb_colname = '<documentdb collection name>'
feedurl = 'http://blogs.msdn.com/b/windowsazurej/atom.aspx'

def rsscrawling():
    # Create a DocumentDB client instance
    client = document_client.DocumentClient(Docdb_host,
                                            {'masterKey': Docdb_masterKey})

    # Create the database if it has not been created yet
    database_definition = {'id': Docdb_dbname}
    databases = list(client.QueryDatabases({
        'query': 'SELECT * FROM root r WHERE r.id=@id',
        'parameters': [
            {'name': '@id', 'value': database_definition['id']}
        ]
    }))
    if len(databases) > 0:
        feeddb = databases[0]
    else:
        print("database is created: %s" % Docdb_dbname)
        feeddb = client.CreateDatabase(database_definition)

    # Create the collection if it has not been created yet
    collection_definition = {'id': Docdb_colname}
    collections = list(client.QueryCollections(
        feeddb['_self'],
        {
            'query': 'SELECT * FROM root r WHERE r.id=@id',
            'parameters': [
                {'name': '@id', 'value': collection_definition['id']}
            ]
        }))
    if len(collections) > 0:
        collection = collections[0]
    else:
        print("collection is created: %s" % Docdb_colname)
        collection = client.CreateCollection(
            feeddb['_self'], collection_definition)

    # Request and parse the RSS feed via feedparser
    feed = feedparser.parse(feedurl)
    for entry in feed['entries']:
        document_definition = {
            'title': entry['title'],
            'content': entry['description'],
            'permalink': entry['link'],
            # Newer feedparser releases dropped the 'date' alias, so fall
            # back to 'published' / 'updated' when it is missing.
            'postdate': entry.get('date',
                                  entry.get('published',
                                            entry.get('updated', '')))
        }
        # Check for a duplicate by permalink before inserting
        stored = list(client.QueryDocuments(
            collection['_self'],
            {
                'query': 'SELECT * FROM root r WHERE r.permalink=@permalink',
                'parameters': [
                    {'name': '@permalink',
                     'value': document_definition['permalink']}
                ]
            }))
        if len(stored) < 1:
            # Only create the document if it is entirely new
            print("document is added: title: %s" % entry['title'])
            client.CreateDocument(
                collection['_self'], document_definition)

if __name__ == '__main__':
    rsscrawling()
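Note that pydocumentdb has since been superseded by the azure-cosmos package (DocumentDB is now Azure Cosmos DB). Purely as a sketch of the same create-if-missing-and-store flow on the newer SDK, it looks roughly like this; the database/container names and the hash-derived id are illustrative assumptions, not part of the original gist:

    import hashlib
    from azure.cosmos import CosmosClient, PartitionKey

    client = CosmosClient('https://<account>.documents.azure.com:443/',
                          credential='<your key>')
    db = client.create_database_if_not_exists(id='feeddb')
    container = db.create_container_if_not_exists(
        id='feedcol', partition_key=PartitionKey(path='/permalink'))

    doc = {'title': '...', 'content': '...',
           'permalink': 'http://example.com/post', 'postdate': '...'}
    # Cosmos DB requires an 'id' field; deriving it from the permalink makes
    # the upsert idempotent, so re-crawling never duplicates an entry and the
    # separate duplicate-check query becomes unnecessary.
    doc['id'] = hashlib.sha1(doc['permalink'].encode('utf-8')).hexdigest()
    container.upsert_item(doc)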