Skip to content

Instantly share code, notes, and snippets.

@rajat404
Created December 9, 2014 05:29
Show Gist options
  • Save rajat404/c11df37d43f9b1020d47 to your computer and use it in GitHub Desktop.
Save rajat404/c11df37d43f9b1020d47 to your computer and use it in GitHub Desktop.
dedup project-phase1
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<center><h1><u><b>Textual Analysis for Detection & Removal of Duplicates</b></u></h1></center>"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"<h2><center>About</center></h2>\n",
"<ul>\n",
"<li>The aim of this project is to find and remove duplicate or near-duplicate text.</li>\n",
"<li>Here we take the specific case of tweets (from Twitter).</li>\n",
"<li>The project aims to reduce the amount of redundant data seen across the internet, primarily to conserve the reader's time.</li>\n",
"</ul>\n"
]
},
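{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>Authentication</h2>\n",
"The access tokens and API secrets are read from the JSON file <code>keys.txt</code>, which must define <code>CONSUMER_KEY</code>, <code>CONSUMER_SECRET</code>, <code>OAUTH_TOKEN</code> and <code>OAUTH_TOKEN_SECRET</code>."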
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import json\n",
"import twitter\n",
"\n",
"# keys.txt is a JSON file holding the four OAuth credentials read below.\n",
"# The context manager closes the file as soon as it has been parsed\n",
"# instead of leaving the handle open for the life of the kernel.\n",
"with open(\"keys.txt\") as keys_file:\n",
"    authval = json.load(keys_file)\n",
"\n",
"CONSUMER_KEY = authval['CONSUMER_KEY']\n",
"CONSUMER_SECRET = authval['CONSUMER_SECRET']\n",
"OAUTH_TOKEN = authval['OAUTH_TOKEN']\n",
"OAUTH_TOKEN_SECRET = authval['OAUTH_TOKEN_SECRET']\n",
"\n",
"# The user token pair is passed first, followed by the app's consumer pair.\n",
"auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,\n",
"                           CONSUMER_KEY, CONSUMER_SECRET)\n",
"\n",
"# Authenticated API client used by the later cells.\n",
"t = twitter.Twitter(auth=auth)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from hr import hr"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 44
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Confirm the stored credentials work by showing the account they belong to\n",
"def verify():\n",
"    \"\"\"Print the name and screen name returned by verify_credentials().\n",
"\n",
"    Uses the notebook-level client `t`; nothing is returned.\n",
"    \"\"\"\n",
"    account = t.account.verify_credentials()\n",
"    print \"Name: \", account['name']\n",
"    print \"Screen Name: \", account['screen_name']\n",
"\n",
"verify()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Name: Rajat Goyal\n",
"Screen Name: rajat404\n"
]
}
],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Keep the first entry returned by the home timeline as the sample tweet\n",
"# used by the cells below. NOTE(review): the result is presumably a list,\n",
"# so [0] would raise IndexError on an empty timeline - confirm.\n",
"testTweet = t.statuses.home_timeline()[0]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 22
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>Sanitization</h2>"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pprint(testTweet)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"{u'contributors': None,\n",
" u'coordinates': None,\n",
" u'created_at': u'Mon Dec 08 19:31:13 +0000 2014',\n",
" u'entities': {u'hashtags': [{u'indices': [34, 43], u'text': u'bettchat'}],\n",
" u'symbols': [],\n",
" u'urls': [{u'display_url': u'bit.ly/BettChat',\n",
" u'expanded_url': u'http://bit.ly/BettChat',\n",
" u'indices': [139, 140],\n",
" u'url': u'http://t.co/UlXwIj49TM'}],\n",
" u'user_mentions': [{u'id': 28354758,\n",
" u'id_str': u'28354758',\n",
" u'indices': [3, 13],\n",
" u'name': u'Bett',\n",
" u'screen_name': u'Bett_show'},\n",
" {u'id': 49793,\n",
" u'id_str': u'49793',\n",
" u'indices': [82, 94],\n",
" u'name': u'Jimmy Wales',\n",
" u'screen_name': u'jimmy_wales'},\n",
" {u'id': 86390214,\n",
" u'id_str': u'86390214',\n",
" u'indices': [107, 117],\n",
" u'name': u'Wikipedia ',\n",
" u'screen_name': u'Wikipedia'}]},\n",
" u'favorite_count': 0,\n",
" u'favorited': False,\n",
" u'geo': None,\n",
" u'id': 542038709655584768,\n",
" u'id_str': u'542038709655584768',\n",
" u'in_reply_to_screen_name': None,\n",
" u'in_reply_to_status_id': None,\n",
" u'in_reply_to_status_id_str': None,\n",
" u'in_reply_to_user_id': None,\n",
" u'in_reply_to_user_id_str': None,\n",
" u'lang': u'en',\n",
" u'place': None,\n",
" u'possibly_sensitive': False,\n",
" u'retweet_count': 2,\n",
" u'retweeted': False,\n",
" u'retweeted_status': {u'contributors': None,\n",
" u'coordinates': None,\n",
" u'created_at': u'Mon Dec 08 18:00:17 +0000 2014',\n",
" u'entities': {u'hashtags': [{u'indices': [19, 28],\n",
" u'text': u'bettchat'}],\n",
" u'symbols': [],\n",
" u'urls': [{u'display_url': u'bit.ly/BettChat',\n",
" u'expanded_url': u'http://bit.ly/BettChat',\n",
" u'indices': [104, 126],\n",
" u'url': u'http://t.co/UlXwIj49TM'}],\n",
" u'user_mentions': [{u'id': 49793,\n",
" u'id_str': u'49793',\n",
" u'indices': [67,\n",
" 79],\n",
" u'name': u'Jimmy Wales',\n",
" u'screen_name': u'jimmy_wales'},\n",
" {u'id': 86390214,\n",
" u'id_str': u'86390214',\n",
" u'indices': [92,\n",
" 102],\n",
" u'name': u'Wikipedia ',\n",
" u'screen_name': u'Wikipedia'}]},\n",
" u'favorite_count': 0,\n",
" u'favorited': False,\n",
" u'geo': None,\n",
" u'id': 542015823741603841,\n",
" u'id_str': u'542015823741603841',\n",
" u'in_reply_to_screen_name': None,\n",
" u'in_reply_to_status_id': None,\n",
" u'in_reply_to_status_id_str': None,\n",
" u'in_reply_to_user_id': None,\n",
" u'in_reply_to_user_id_str': None,\n",
" u'lang': u'en',\n",
" u'place': None,\n",
" u'possibly_sensitive': False,\n",
" u'retweet_count': 2,\n",
" u'retweeted': False,\n",
" u'source': u'<a href=\"https://about.twitter.com/products/tweetdeck\" rel=\"nofollow\">TweetDeck</a>',\n",
" u'text': u'Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia!\\nhttp://t.co/UlXwIj49TM',\n",
" u'truncated': False,\n",
" u'user': {u'contributors_enabled': False,\n",
" u'created_at': u'Thu Apr 02 15:13:29 +0000 2009',\n",
" u'default_profile': False,\n",
" u'default_profile_image': False,\n",
" u'description': u\"Bett, is the World's Largest Education Technology Show | Taking place on the 21-24th Jan 2015 #Bett2015 | Join our weekly #Bettchat Tues 4.30-5.30pm\",\n",
" u'entities': {u'description': {u'urls': []},\n",
" u'url': {u'urls': [{u'display_url': u'bettshow.com',\n",
" u'expanded_url': u'http://www.bettshow.com',\n",
" u'indices': [0,\n",
" 22],\n",
" u'url': u'http://t.co/oaInAp0mNz'}]}},\n",
" u'favourites_count': 814,\n",
" u'follow_request_sent': False,\n",
" u'followers_count': 16327,\n",
" u'following': False,\n",
" u'friends_count': 2967,\n",
" u'geo_enabled': False,\n",
" u'id': 28354758,\n",
" u'id_str': u'28354758',\n",
" u'is_translation_enabled': False,\n",
" u'is_translator': False,\n",
" u'lang': u'en',\n",
" u'listed_count': 467,\n",
" u'location': u'ExCeL, London',\n",
" u'name': u'Bett',\n",
" u'notifications': False,\n",
" u'profile_background_color': u'642D8B',\n",
" u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme10/bg.gif',\n",
" u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme10/bg.gif',\n",
" u'profile_background_tile': False,\n",
" u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/28354758/1414747808',\n",
" u'profile_image_url': u'http://pbs.twimg.com/profile_images/494061286565486592/L_CeIPiw_normal.png',\n",
" u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/494061286565486592/L_CeIPiw_normal.png',\n",
" u'profile_link_color': u'FF0000',\n",
" u'profile_location': None,\n",
" u'profile_sidebar_border_color': u'FFFFFF',\n",
" u'profile_sidebar_fill_color': u'7AC3EE',\n",
" u'profile_text_color': u'3D1957',\n",
" u'profile_use_background_image': False,\n",
" u'protected': False,\n",
" u'screen_name': u'Bett_show',\n",
" u'statuses_count': 9772,\n",
" u'time_zone': u'London',\n",
" u'url': u'http://t.co/oaInAp0mNz',\n",
" u'utc_offset': 0,\n",
" u'verified': False}},\n",
" u'source': u'<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>',\n",
" u'text': u'RT @Bett_show: Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia!\\nhttp://t.co/UlXwIj49\\u2026',\n",
" u'truncated': False,\n",
" u'user': {u'contributors_enabled': False,\n",
" u'created_at': u'Fri Dec 08 05:17:11 +0000 2006',\n",
" u'default_profile': True,\n",
" u'default_profile_image': False,\n",
" u'description': u'You know, the @Wikipedia and @Wikia guy. Free speech activist, entrepreneur.',\n",
" u'entities': {u'description': {u'urls': []},\n",
" u'url': {u'urls': [{u'display_url': u'facebook.com/pages/Jimmy-Wa\\u2026',\n",
" u'expanded_url': u'http://www.facebook.com/pages/Jimmy-Wales/10655515679',\n",
" u'indices': [0, 22],\n",
" u'url': u'http://t.co/RFNOTdr60k'}]}},\n",
" u'favourites_count': 88,\n",
" u'follow_request_sent': False,\n",
" u'followers_count': 103889,\n",
" u'following': True,\n",
" u'friends_count': 657,\n",
" u'geo_enabled': False,\n",
" u'id': 49793,\n",
" u'id_str': u'49793',\n",
" u'is_translation_enabled': False,\n",
" u'is_translator': False,\n",
" u'lang': u'en',\n",
" u'listed_count': 4330,\n",
" u'location': u'Florida and London',\n",
" u'name': u'Jimmy Wales',\n",
" u'notifications': False,\n",
" u'profile_background_color': u'C0DEED',\n",
" u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme1/bg.png',\n",
" u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme1/bg.png',\n",
" u'profile_background_tile': False,\n",
" u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/49793/1399594610',\n",
" u'profile_image_url': u'http://pbs.twimg.com/profile_images/15944612/small_sepia_jimbo_normal.jpg',\n",
" u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/15944612/small_sepia_jimbo_normal.jpg',\n",
" u'profile_link_color': u'0084B4',\n",
" u'profile_location': None,\n",
" u'profile_sidebar_border_color': u'C0DEED',\n",
" u'profile_sidebar_fill_color': u'DDEEF6',\n",
" u'profile_text_color': u'333333',\n",
" u'profile_use_background_image': True,\n",
" u'protected': False,\n",
" u'screen_name': u'jimmy_wales',\n",
" u'statuses_count': 8790,\n",
" u'time_zone': u'Pacific Time (US & Canada)',\n",
" u'url': u'http://t.co/RFNOTdr60k',\n",
" u'utc_offset': -28800,\n",
" u'verified': True}}\n"
]
}
],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sampleText = \"\"\"\n",
"Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia! http://bit.ly/BettChat\n",
"\"\"\""
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print sampleText"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia! http://bit.ly/BettChat\n",
"\n"
]
}
],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Show the parts of the sample tweet looked at here: author name and\n",
"# screen name, timestamp, text and URL entities. hr() prints a separator row.\n",
"print testTweet['user']['name']\n",
"print testTweet['user']['screen_name']\n",
"hr()\n",
"print \"\\nTimestamp:\", testTweet['created_at']\n",
"hr()\n",
"print \"\\nTweet Text:\", testTweet['text']\n",
"hr()\n",
"print \"URLs in Tweet:\"\n",
"pprint(testTweet['entities']['urls'])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Jimmy Wales\n",
"jimmy_wales\n",
"####################################################################################################################################################################\n",
"\n",
"Timestamp: Mon Dec 08 19:31:13 +0000 2014\n",
"####################################################################################################################################################################\n",
"\n",
"Tweet Text: RT @Bett_show: Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia!\n",
"http://t.co/UlXwIj49\u2026\n",
"####################################################################################################################################################################\n",
"URLs in Tweet:\n",
"[{u'display_url': u'bit.ly/BettChat',\n",
" u'expanded_url': u'http://bit.ly/BettChat',\n",
" u'indices': [139, 140],\n",
" u'url': u'http://t.co/UlXwIj49TM'}]\n"
]
}
],
"prompt_number": 47
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Reduced view of the sample tweet: its text, its URL entities and the\n",
"# author's display name and screen name.\n",
"sampleDict = {\n",
"    'text': testTweet['text'],\n",
"    'urls': testTweet['entities']['urls'],\n",
"    'user': dict((field, testTweet['user'][field])\n",
"                 for field in ('name', 'screen_name')),\n",
"}"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 39
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pprint(sampleDict)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"{'text': u'RT @Bett_show: Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia!\\nhttp://t.co/UlXwIj49\\u2026',\n",
" 'urls': [{u'display_url': u'bit.ly/BettChat',\n",
" u'expanded_url': u'http://bit.ly/BettChat',\n",
" u'indices': [139, 140],\n",
" u'url': u'http://t.co/UlXwIj49TM'}],\n",
" 'user': {'name': u'Jimmy Wales', 'screen_name': u'jimmy_wales'}}\n"
]
}
],
"prompt_number": 40
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>Data Cleaning</h2>"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from string import punctuation\n",
"\n",
"# Characters that sanitize() blanks out; a set keeps the 'in' test fast.\n",
"# \"_\" and \"@\" are left out of the set so that @screen_name mentions\n",
"# survive cleaning. \"#\" stays in it, so hashtags lose their marker.\n",
"set_punct = set(punctuation) - {\"_\", \"@\"}\n",
"\n",
"def sanitize(text, set_excludes):\n",
"    \"\"\"Return a cleaned, lowercase version of the string `text`.\n",
"\n",
"    Steps, in the order they are applied:\n",
"      1. Convert uppercase letters to lowercase.\n",
"      2. Drop every whitespace-separated word containing \"http://\" or\n",
"         \"https://\" (hyperlinks).\n",
"      3. Replace each character found in `set_excludes` with a space.\n",
"      4. Drop words that are only one character long.\n",
"\n",
"    The surviving words are joined with single spaces, so an empty or\n",
"    whitespace-only `text` yields \"\".\n",
"\n",
"    For example, assuming punctuation is to be excluded:\n",
"        \"John's car\" -> \"john s car\" -> \"john car\"\n",
"    \"\"\"\n",
"    link_markers = (\"http://\", \"https://\")\n",
"\n",
"    # Links are removed while the text is still split on whitespace,\n",
"    # before punctuation stripping would break them into fragments.\n",
"    words = [w for w in text.lower().split()\n",
"             if not any(marker in w for marker in link_markers)]\n",
"\n",
"    # Blanking excluded characters can split one word into several\n",
"    # (e.g. \"4:30pm\" -> \"4 30pm\"), so the text is re-split afterwards.\n",
"    text = \"\".join(\" \" if c in set_excludes else c for c in \" \".join(words))\n",
"\n",
"    # split() already strips whitespace; only the length filter is needed.\n",
"    return \" \".join(w for w in text.split() if len(w) > 1)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 71
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print set_punct"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"set(['!', '#', '\"', '%', '$', \"'\", '&', ')', '(', '+', '*', '-', ',', '/', '.', ';', ':', '=', '<', '?', '>', '[', ']', '\\\\', '^', '`', '{', '}', '|', '~'])\n"
]
}
],
"prompt_number": 72
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cleanTweet = sanitize(testTweet['text'], set_punct)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 73
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print testTweet['text']\n",
"hr()\n",
"print cleanTweet"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"RT @Bett_show: Make sure you join #bettchat tomorrow @ 4:30pm (GMT) with our host @jimmy_wales, founder of @Wikipedia!\n",
"http://t.co/UlXwIj49\u2026\n",
"####################################################################################################################################################################\n",
"rt @bett_show make sure you join bettchat tomorrow 30pm gmt with our host @jimmy_wales founder of @wikipedia\n"
]
}
],
"prompt_number": 74
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"hr()\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"####################################################################################################################################################################\n"
]
}
],
"prompt_number": 75
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"import itertools\n",
"twiterator = itertools.chain.from_iterable(\n",
" itertools.repeat(\n",
" t.statuses.home_timeline()))"
]
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment