Created
December 9, 2014 05:29
-
-
Save rajat404/c11df37d43f9b1020d47 to your computer and use it in GitHub Desktop.
dedup project-phase1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<center><h1><u><b>Textual Analysis for Detection & Removal of Duplicates</b></u></h1></center>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "-" | |
} | |
}, | |
"source": [ | |
"<h2><center>About</center></h2>\n", | |
"<li>The aim of this project is to find and remove duplicates or near-duplicates from text\n", | |
"<li>Here we are taking the specific case of tweets (from Twitter)\n", | |
"<li>This project aims to reduce the amount of redundant data we see across the internet, primarily to conserve time\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<h2>Authentication</h2>\n", | |
"We shall use the access token and API secrets in the file keys.txt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import json\n", | |
"import twitter\n", | |
"authval = json.load(open(\"keys.txt\"))\n", | |
"CONSUMER_KEY = authval['CONSUMER_KEY']\n", | |
"CONSUMER_SECRET = authval['CONSUMER_SECRET']\n", | |
"OAUTH_TOKEN = authval['OAUTH_TOKEN'] \n", | |
"OAUTH_TOKEN_SECRET = authval['OAUTH_TOKEN_SECRET']\n", | |
"\n", | |
"auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,\n", | |
" CONSUMER_KEY, CONSUMER_SECRET)\n", | |
"\n", | |
"t = twitter.Twitter(auth=auth)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from pprint import pprint\n", | |
"from hr import hr" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 44 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#Function to check the credentials of the User\n", | |
"def verify():\n", | |
" verificationDetails = t.account.verify_credentials()\n", | |
" print \"Name: \", verificationDetails['name']\n", | |
" print \"Screen Name: \", verificationDetails['screen_name']\n", | |
" \n", | |
"verify()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Name: Rajat Goyal\n", | |
"Screen Name: rajat404\n" | |
] | |
} | |
], | |
"prompt_number": 31 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"testTweet = t.statuses.home_timeline()[0]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 22 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<h2>Sanitization</h2>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pprint(testTweet)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"{u'contributors': None,\n", | |
" u'coordinates': None,\n", | |
" u'created_at': u'Mon Dec 08 19:31:13 +0000 2014',\n", | |
" u'entities': {u'hashtags': [{u'indices': [34, 43], u'text': u'bettchat'}],\n", | |
" u'symbols': [],\n", | |
" u'urls': [{u'display_url': u'bit.ly/BettChat',\n", | |
" u'expanded_url': u'http://bit.ly/BettChat',\n", | |
" u'indices': [139, 140],\n", | |
" u'url': u'http://t.co/UlXwIj49TM'}],\n", | |
" u'user_mentions': [{u'id': 28354758,\n", | |
" u'id_str': u'28354758',\n", | |
" u'indices': [3, 13],\n", | |
" u'name': u'Bett',\n", | |
" u'screen_name': u'Bett_show'},\n", | |
" {u'id': 49793,\n", | |
" u'id_str': u'49793',\n", | |
" u'indices': [82, 94],\n", | |
" u'name': u'Jimmy Wales',\n", | |
" u'screen_name': u'jimmy_wales'},\n", | |
" {u'id': 86390214,\n", | |
" u'id_str': u'86390214',\n", | |
" u'indices': [107, 117],\n", | |
" u'name': u'Wikipedia ',\n", | |
" u'screen_name': u'Wikipedia'}]},\n", | |
" u'favorite_count': 0,\n", | |
" u'favorited': False,\n", | |
" u'geo': None,\n", | |
" u'id': 542038709655584768,\n", | |
" u'id_str': u'542038709655584768',\n", | |
" u'in_reply_to_screen_name': None,\n", | |
" u'in_reply_to_status_id': None,\n", | |
" u'in_reply_to_status_id_str': None,\n", | |
" u'in_reply_to_user_id': None,\n", | |
" u'in_reply_to_user_id_str': None,\n", | |
" u'lang': u'en',\n", | |
" u'place': None,\n", | |
" u'possibly_sensitive': False,\n", | |
" u'retweet_count': 2,\n", | |
" u'retweeted': False,\n", | |
" u'retweeted_status': {u'contributors': None,\n", | |
" u'coordinates': None,\n", | |
" u'created_at': u'Mon Dec 08 18:00:17 +0000 2014',\n", | |
" u'entities': {u'hashtags': [{u'indices': [19, 28],\n", | |
" u'text': u'bettchat'}],\n", | |
" u'symbols': [],\n", | |
" u'urls': [{u'display_url': u'bit.ly/BettChat',\n", | |
" u'expanded_url': u'http://bit.ly/BettChat',\n", | |
" u'indices': [104, 126],\n", | |
" u'url': u'http://t.co/UlXwIj49TM'}],\n", | |
" u'user_mentions': [{u'id': 49793,\n", | |
" u'id_str': u'49793',\n", | |
" u'indices': [67,\n", | |
" 79],\n", | |
" u'name': u'Jimmy Wales',\n", | |
" u'screen_name': u'jimmy_wales'},\n", | |
" {u'id': 86390214,\n", | |
" u'id_str': u'86390214',\n", | |
" u'indices': [92,\n", | |
" 102],\n", | |
" u'name': u'Wikipedia ',\n", | |
" u'screen_name': u'Wikipedia'}]},\n", | |
" u'favorite_count': 0,\n", | |
" u'favorited': False,\n", | |
" u'geo': None,\n", | |
" u'id': 542015823741603841,\n", | |
" u'id_str': u'542015823741603841',\n", | |
" u'in_reply_to_screen_name': None,\n", | |
" u'in_reply_to_status_id': None,\n", | |
" u'in_reply_to_status_id_str': None,\n", | |
" u'in_reply_to_user_id': None,\n", | |
" u'in_reply_to_user_id_str': None,\n", | |
" u'lang': u'en',\n", | |
" u'place': None,\n", | |
" u'possibly_sensitive': False,\n", | |
" u'retweet_count': 2,\n", | |
" u'retweeted': False,\n", | |
" u'source': u'<a href=\"https://about.twitter.com/products/tweetdeck\" rel=\"nofollow\">TweetDeck</a>',\n", | |
" u'text': u'Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia!\\nhttp://t.co/UlXwIj49TM',\n", | |
" u'truncated': False,\n", | |
" u'user': {u'contributors_enabled': False,\n", | |
" u'created_at': u'Thu Apr 02 15:13:29 +0000 2009',\n", | |
" u'default_profile': False,\n", | |
" u'default_profile_image': False,\n", | |
" u'description': u\"Bett, is the World's Largest Education Technology Show | Taking place on the 21-24th Jan 2015 #Bett2015 | Join our weekly #Bettchat Tues 4.30-5.30pm\",\n", | |
" u'entities': {u'description': {u'urls': []},\n", | |
" u'url': {u'urls': [{u'display_url': u'bettshow.com',\n", | |
" u'expanded_url': u'http://www.bettshow.com',\n", | |
" u'indices': [0,\n", | |
" 22],\n", | |
" u'url': u'http://t.co/oaInAp0mNz'}]}},\n", | |
" u'favourites_count': 814,\n", | |
" u'follow_request_sent': False,\n", | |
" u'followers_count': 16327,\n", | |
" u'following': False,\n", | |
" u'friends_count': 2967,\n", | |
" u'geo_enabled': False,\n", | |
" u'id': 28354758,\n", | |
" u'id_str': u'28354758',\n", | |
" u'is_translation_enabled': False,\n", | |
" u'is_translator': False,\n", | |
" u'lang': u'en',\n", | |
" u'listed_count': 467,\n", | |
" u'location': u'ExCeL, London',\n", | |
" u'name': u'Bett',\n", | |
" u'notifications': False,\n", | |
" u'profile_background_color': u'642D8B',\n", | |
" u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme10/bg.gif',\n", | |
" u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme10/bg.gif',\n", | |
" u'profile_background_tile': False,\n", | |
" u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/28354758/1414747808',\n", | |
" u'profile_image_url': u'http://pbs.twimg.com/profile_images/494061286565486592/L_CeIPiw_normal.png',\n", | |
" u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/494061286565486592/L_CeIPiw_normal.png',\n", | |
" u'profile_link_color': u'FF0000',\n", | |
" u'profile_location': None,\n", | |
" u'profile_sidebar_border_color': u'FFFFFF',\n", | |
" u'profile_sidebar_fill_color': u'7AC3EE',\n", | |
" u'profile_text_color': u'3D1957',\n", | |
" u'profile_use_background_image': False,\n", | |
" u'protected': False,\n", | |
" u'screen_name': u'Bett_show',\n", | |
" u'statuses_count': 9772,\n", | |
" u'time_zone': u'London',\n", | |
" u'url': u'http://t.co/oaInAp0mNz',\n", | |
" u'utc_offset': 0,\n", | |
" u'verified': False}},\n", | |
" u'source': u'<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>',\n", | |
" u'text': u'RT @Bett_show: Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia!\\nhttp://t.co/UlXwIj49\\u2026',\n", | |
" u'truncated': False,\n", | |
" u'user': {u'contributors_enabled': False,\n", | |
" u'created_at': u'Fri Dec 08 05:17:11 +0000 2006',\n", | |
" u'default_profile': True,\n", | |
" u'default_profile_image': False,\n", | |
" u'description': u'You know, the @Wikipedia and @Wikia guy. Free speech activist, entrepreneur.',\n", | |
" u'entities': {u'description': {u'urls': []},\n", | |
" u'url': {u'urls': [{u'display_url': u'facebook.com/pages/Jimmy-Wa\\u2026',\n", | |
" u'expanded_url': u'http://www.facebook.com/pages/Jimmy-Wales/10655515679',\n", | |
" u'indices': [0, 22],\n", | |
" u'url': u'http://t.co/RFNOTdr60k'}]}},\n", | |
" u'favourites_count': 88,\n", | |
" u'follow_request_sent': False,\n", | |
" u'followers_count': 103889,\n", | |
" u'following': True,\n", | |
" u'friends_count': 657,\n", | |
" u'geo_enabled': False,\n", | |
" u'id': 49793,\n", | |
" u'id_str': u'49793',\n", | |
" u'is_translation_enabled': False,\n", | |
" u'is_translator': False,\n", | |
" u'lang': u'en',\n", | |
" u'listed_count': 4330,\n", | |
" u'location': u'Florida and London',\n", | |
" u'name': u'Jimmy Wales',\n", | |
" u'notifications': False,\n", | |
" u'profile_background_color': u'C0DEED',\n", | |
" u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme1/bg.png',\n", | |
" u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme1/bg.png',\n", | |
" u'profile_background_tile': False,\n", | |
" u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/49793/1399594610',\n", | |
" u'profile_image_url': u'http://pbs.twimg.com/profile_images/15944612/small_sepia_jimbo_normal.jpg',\n", | |
" u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/15944612/small_sepia_jimbo_normal.jpg',\n", | |
" u'profile_link_color': u'0084B4',\n", | |
" u'profile_location': None,\n", | |
" u'profile_sidebar_border_color': u'C0DEED',\n", | |
" u'profile_sidebar_fill_color': u'DDEEF6',\n", | |
" u'profile_text_color': u'333333',\n", | |
" u'profile_use_background_image': True,\n", | |
" u'protected': False,\n", | |
" u'screen_name': u'jimmy_wales',\n", | |
" u'statuses_count': 8790,\n", | |
" u'time_zone': u'Pacific Time (US & Canada)',\n", | |
" u'url': u'http://t.co/RFNOTdr60k',\n", | |
" u'utc_offset': -28800,\n", | |
" u'verified': True}}\n" | |
] | |
} | |
], | |
"prompt_number": 23 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sampleText = \"\"\"\n", | |
"Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia! http://bit.ly/BettChat\n", | |
"\"\"\"" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 24 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print sampleText" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia! http://bit.ly/BettChat\n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 28 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print testTweet['user']['name']\n", | |
"print testTweet['user']['screen_name']\n", | |
"hr()\n", | |
"print \"\\nTimestamp:\", testTweet['created_at']\n", | |
"hr()\n", | |
"print \"\\nTweet Text:\", testTweet['text']\n", | |
"hr()\n", | |
"print \"URLs in Tweet:\"\n", | |
"pprint(testTweet['entities']['urls'])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Jimmy Wales\n", | |
"jimmy_wales\n", | |
"####################################################################################################################################################################\n", | |
"\n", | |
"Timestamp: Mon Dec 08 19:31:13 +0000 2014\n", | |
"####################################################################################################################################################################\n", | |
"\n", | |
"Tweet Text: RT @Bett_show: Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia!\n", | |
"http://t.co/UlXwIj49\u2026\n", | |
"####################################################################################################################################################################\n", | |
"URLs in Tweet:\n", | |
"[{u'display_url': u'bit.ly/BettChat',\n", | |
" u'expanded_url': u'http://bit.ly/BettChat',\n", | |
" u'indices': [139, 140],\n", | |
" u'url': u'http://t.co/UlXwIj49TM'}]\n" | |
] | |
} | |
], | |
"prompt_number": 47 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sampleDict = {'text':testTweet['text'], 'urls':testTweet['entities']['urls'],\n", | |
" 'user':{\n", | |
" 'name':testTweet['user']['name'], \n", | |
" 'screen_name':testTweet['user']['screen_name']\n", | |
" }\n", | |
" }" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 39 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pprint(sampleDict)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"{'text': u'RT @Bett_show: Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia!\\nhttp://t.co/UlXwIj49\\u2026',\n", | |
" 'urls': [{u'display_url': u'bit.ly/BettChat',\n", | |
" u'expanded_url': u'http://bit.ly/BettChat',\n", | |
" u'indices': [139, 140],\n", | |
" u'url': u'http://t.co/UlXwIj49TM'}],\n", | |
" 'user': {'name': u'Jimmy Wales', 'screen_name': u'jimmy_wales'}}\n" | |
] | |
} | |
], | |
"prompt_number": 40 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<h2>Data Cleaning</h2>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from string import punctuation\n", | |
"set_punct = set(punctuation) # need set for fast 'in' operator\n", | |
"\n", | |
"# Keep \"_\" and \"@\" out of the exclusion set so that @mentions and screen\n", | |
"# names such as @jimmy_wales survive sanitizing. To keep the hashtag marker\n", | |
"# too, subtract {\"#\", \"@\", \"_\"} instead.\n", | |
"set_punct = set_punct - {\"_\", \"@\"}\n", | |
"\n", | |
"def sanitize(text, set_excludes):\n", | |
" \"\"\"Return a `sanitized` version of the string `text`. Characters\n", | |
" in `set_excludes` are removed.\n", | |
" \n", | |
" Specifically, We:\n", | |
" 1. Replace any characters in set_excludes with spaces, \n", | |
" 2. Convert uppercase letters to lowercase, \n", | |
" 3. Remove one-letter words. \n", | |
" 4. Remove words containing \"http://\"\n", | |
" \n", | |
" For example:\n", | |
" \"John's car\" -> \"john s car\" -> \"john car\"\n", | |
" assuming punctuation is to be excluded.\n", | |
" \"\"\"\n", | |
" \n", | |
" text = text.lower()\n", | |
" \n", | |
" # split into words to remove hyperlinks, then join back into string:\n", | |
" text = \" \".join([ w for w in text.split() if not (\"http://\" in w) ])\n", | |
" \n", | |
" # filter bad letters (this uses a python ternary statement):\n", | |
" letters_noPunct = [ (\" \" if c in set_excludes else c) for c in text ]\n", | |
" \n", | |
" # Join letters into string, then split into words to \n", | |
" # remove one-letter words\n", | |
" text = \"\".join(letters_noPunct)\n", | |
" words = text.split()\n", | |
" long_enuf_words = [w.strip() for w in words if len(w)>1]\n", | |
" \n", | |
" return \" \".join(long_enuf_words) # spaces between words" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 71 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print set_punct" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"set(['!', '#', '\"', '%', '$', \"'\", '&', ')', '(', '+', '*', '-', ',', '/', '.', ';', ':', '=', '<', '?', '>', '[', ']', '\\\\', '^', '`', '{', '}', '|', '~'])\n" | |
] | |
} | |
], | |
"prompt_number": 72 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"cleanTweet = sanitize(testTweet['text'], set_punct)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 73 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print testTweet['text']\n", | |
"hr()\n", | |
"print cleanTweet" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"RT @Bett_show: Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia!\n", | |
"http://t.co/UlXwIj49\u2026\n", | |
"####################################################################################################################################################################\n", | |
"rt @bett_show make sure you join bettchat tomorrow 30pm gmt with our host @jimmy_wales founder of @wikipedia\n" | |
] | |
} | |
], | |
"prompt_number": 74 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"hr()\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"####################################################################################################################################################################\n" | |
] | |
} | |
], | |
"prompt_number": 75 | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"import itertools\n", | |
"twiterator = itertools.chain.from_iterable(\n", | |
" itertools.repeat(\n", | |
" t.statuses.home_timeline()))" | |
] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment