Skip to content

Instantly share code, notes, and snippets.

@AdityaSoni19031997
Created March 6, 2018 12:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AdityaSoni19031997/e826a738faada01d3e293bcd6012fd06 to your computer and use it in GitHub Desktop.
Save AdityaSoni19031997/e826a738faada01d3e293bcd6012fd06 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-06T11:57:15.630857Z",
"start_time": "2018-03-06T11:57:15.574862Z"
}
},
"outputs": [],
"source": [
"l = []\n",
"def preprocess(path_to_inp_json_file, path_to_out_txt_file):\n",
" with open(path_to_inp_json_file) as inp_file, \\\n",
" open(path_to_out_txt_file, 'w',encoding=\"utf-8\") as out_file:\n",
" for line in tqdm_notebook(inp_file):\n",
" json_data = json.loads(line)\n",
" content = json_data['content'].replace('\\n', ' ').replace('\\r', ' ')\n",
" l.append(json_data)\n",
" return json_data\n",
" content_no_html_tags = strip_tags(content)\n",
" out_file.write(content_no_html_tags + '\\n')\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-06T11:57:19.281902Z",
"start_time": "2018-03-06T11:57:18.901726Z"
}
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5b73c46f17cd46fba7c778d807ca7013",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"A Jupyter Widget"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Wall time: 353 ms\n"
]
}
],
"source": [
"%%time\n",
"jsonofabitch = preprocess(path_to_inp_json_file=os.path.join(PATH_TO_RAW_DATA, 'train.json'),\n",
" path_to_out_txt_file=os.path.join(PATH_TO_PROCESSED_DATA, 'train_raw_content.txt'))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-06T11:57:20.966821Z",
"start_time": "2018-03-06T11:57:20.930821Z"
},
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'_id': 'https://medium.com/policy/medium-terms-of-service-9db0094a1e0f',\n",
" '_spider': 'medium',\n",
" '_timestamp': 1520035195.282891,\n",
" 'author': {'name': None,\n",
" 'twitter': '@Medium',\n",
" 'url': 'https://medium.com/@Medium'},\n",
" 'content': '<div><header class=\"container u-maxWidth740\"><div class=\"uiScale uiScale-ui--regular uiScale-caption--regular postMetaHeader u-paddingBottom10 row\"><div class=\"col u-size12of12 js-postMetaLockup\"><div class=\"uiScale uiScale-ui--regular uiScale-caption--regular postMetaLockup postMetaLockup--authorWithBio u-flexCenter js-postMetaLockup\"><div class=\"u-flex0\"><a class=\"link u-baseColor--link avatar\" href=\"https://medium.com/@Medium?source=post_header_lockup\" data-action=\"show-user-card\" data-action-source=\"post_header_lockup\" data-action-value=\"504c7870fdb6\" data-action-type=\"hover\" data-user-id=\"504c7870fdb6\" dir=\"auto\"><div class=\"u-relative u-inlineBlock u-flex0\"><img src=\"https://cdn-images-1.medium.com/fit/c/120/120/1*6_fgYnisCa9V21mymySIvA.png\" class=\"avatar-image avatar-image--small\" alt=\"Go to the profile of Medium\"><div class=\"avatar-halo u-absolute u-textColorGreenNormal svgIcon\" style=\"width: calc(100% + 12px); height: calc(100% + 12px); top:-6px; left:-6px\"><svg viewbox=\"0 0 114 114\" xmlns=\"http://www.w3.org/2000/svg\"><path d=\"M7.66922967,32.092726 C17.0070768,13.6353618 35.9421928,1.75 57,1.75 C78.0578072,1.75 96.9929232,13.6353618 106.33077,32.092726 L107.66923,31.4155801 C98.0784505,12.4582656 78.6289015,0.25 57,0.25 C35.3710985,0.25 15.9215495,12.4582656 6.33077033,31.4155801 L7.66922967,32.092726 Z\"></path><path d=\"M106.33077,81.661427 C96.9929232,100.118791 78.0578072,112.004153 57,112.004153 C35.9421928,112.004153 17.0070768,100.118791 7.66922967,81.661427 L6.33077033,82.338573 C15.9215495,101.295887 35.3710985,113.504153 57,113.504153 C78.6289015,113.504153 98.0784505,101.295887 107.66923,82.338573 L106.33077,81.661427 Z\"></path></svg></div></div></a></div><div class=\"u-flex1 u-paddingLeft15 u-overflowHidden\"><div class=\"u-lineHeightTightest\"><a class=\"ds-link ds-link--styleSubtle ui-captionStrong u-inlineBlock link link--darken link--darker\" href=\"https://medium.com/@Medium?source=post_header_lockup\" data-action=\"show-user-card\" data-action-source=\"post_header_lockup\" data-action-value=\"504c7870fdb6\" data-action-type=\"hover\" data-user-id=\"504c7870fdb6\" dir=\"auto\">Medium</a><span class=\"followState js-followState\" data-user-id=\"504c7870fdb6\"></span></div><div class=\"ui-caption ui-xs-clamp2 postMetaInline\">Everyone’s stories and ideas</div><div class=\"ui-caption postMetaInline js-testPostMetaInlineSupplemental\"><time datetime=\"2012-08-13T22:54:53.510Z\">Aug 13, 2012</time><span class=\"middotDivider u-fontSize12\"></span><span class=\"readingTime\" title=\"5 min read\"></span></div></div></div></div></div></header><div class=\"postArticle-content js-postField js-notesSource js-trackedPost\" data-post-id=\"9db0094a1e0f\" data-source=\"post_page\" data-collection-id=\"675ebe56ac25\" data-tracking-context=\"postPage\"><section name=\"bb8c\" class=\"section section--body section--first section--last\"><div class=\"section-divider\"><hr class=\"section-divider\"></div><div class=\"section-content\"><div class=\"section-inner sectionLayout--insetColumn\"><h1 name=\"title\" id=\"title\" class=\"graf graf--h2 graf--leading graf--title\">Medium Terms of\\xa0Service</h1><p name=\"571b\" id=\"571b\" class=\"graf graf--p graf-after--h2\"><strong class=\"markup--strong markup--p-strong\">Effective: March 7, 2016</strong></p><p name=\"c90b\" id=\"c90b\" class=\"graf graf--p graf-after--p\">These Terms of Service (“Terms”) are a contract between you and A Medium Corporation. They govern your use of Medium’s sites, services, mobile apps, products, and content (“Services”).</p><p name=\"238b\" id=\"238b\" class=\"graf graf--p graf-after--p\">By using Medium, you agree to these Terms. If you don’t agree to any of the Terms, you can’t use Medium.</p><p name=\"7769\" id=\"7769\" class=\"graf graf--p graf-after--p\">We can change these Terms at any time. We keep a <a href=\"https://github.com/Medium/medium-policy\" data-href=\"https://github.com/Medium/medium-policy\" class=\"markup--anchor markup--p-anchor\" rel=\"nofollow noopener\" target=\"_blank\">historical</a> record of all changes to our Terms on GitHub. If a change is material, we’ll let you know before they take effect. By using Medium on or after that effective date, you agree to the new Terms. If you don’t agree to them, you should delete your account before they take effect, otherwise your use of the site and content will be subject to the new Terms.</p><h4 name=\"8c81\" id=\"8c81\" class=\"graf graf--h4 graf-after--p\"><strong class=\"markup--strong markup--h4-strong\">Content rights &amp; responsibilities</strong></h4><p name=\"ac74\" id=\"ac74\" class=\"graf graf--p graf-after--h4\">You own the rights to the content you create and post on Medium.</p><p name=\"651b\" id=\"651b\" class=\"graf graf--p graf-after--p\">By posting content to Medium, you give us a nonexclusive license to publish it on Medium Services, including anything reasonably related to publishing it (like storing, displaying, reformatting, and distributing it). In consideration for Medium granting you access to and use of the Services, you agree that Medium may enable advertising on the Services, including in connection with the display of your content or other information. We may also use your content to promote Medium, including its products and content. We will never sell your content to third parties without your explicit permission.</p><p name=\"2584\" id=\"2584\" class=\"graf graf--p graf-after--p\">You’re responsible for the content you post. This means you assume all risks related to it, including someone else’s reliance on its accuracy, or claims relating to intellectual property or other legal rights.</p><p name=\"c207\" id=\"c207\" class=\"graf graf--p graf-after--p\">You’re welcome to post content on Medium that you’ve published elsewhere, as long as you have the rights you need to do so. By posting content to Medium, you represent that doing so doesn’t conflict with any other agreement you’ve made.</p><p name=\"0372\" id=\"0372\" class=\"graf graf--p graf-after--p\">By posting content you didn’t create to Medium, you are representing that you have the right to do so. For example, you are posting a work that’s in the public domain, used under license (including a free license, such as <a href=\"https://creativecommons.org/licenses/\" data-href=\"https://creativecommons.org/licenses/\" class=\"markup--anchor markup--p-anchor\" rel=\"nofollow noopener\" target=\"_blank\">Creative Commons</a>), or a fair use.</p><p name=\"0472\" id=\"0472\" class=\"graf graf--p graf-after--p\">We can remove any content you post for any reason.</p><p name=\"db2b\" id=\"db2b\" class=\"graf graf--p graf-after--p\">You can delete any of your posts, or your account, anytime. Processing the deletion may take a little time, but we’ll do it as quickly as possible. We may keep backup copies of your deleted post or account on our servers for up to 14 days after you delete it.</p><h4 name=\"baf1\" id=\"baf1\" class=\"graf graf--h4 graf-after--p\"><strong class=\"markup--strong markup--h4-strong\">Our content and\\xa0services</strong></h4><p name=\"adc7\" id=\"adc7\" class=\"graf graf--p graf-after--h4\">We reserve all rights in Medium’s look and feel. Some parts of Medium are licensed under third-party open source licenses. We also make some of our own code available under open source licenses. As for other parts of Medium, you may not copy or adapt any portion of our code or visual design elements (including logos) without express written permission from Medium unless otherwise permitted by law.</p><p name=\"20e4\" id=\"20e4\" class=\"graf graf--p graf-after--p\">You may not do, or try to do, the following: (1) access or tamper with non-public areas of the Services, our computer systems, or the systems of our technical providers; (2) access or search the Services by any means other than the currently available, published interfaces (e.g., APIs) that we provide; (3) forge any TCP/IP packet header or any part of the header information in any email or posting, or in any way use the Services to send altered, deceptive, or false source-identifying information; or (4) interfere with, or disrupt, the access of any user, host, or network, including sending a virus, overloading, flooding, spamming, mail-bombing the Services, or by scripting the creation of content or accounts in such a manner as to interfere with or create an undue burden on the Services.</p><p name=\"f5dd\" id=\"f5dd\" class=\"graf graf--p graf-after--p\">Crawling the Services is allowed if done in accordance with the provisions of our robots.txt file, but scraping the Services is prohibited.</p><p name=\"71a8\" id=\"71a8\" class=\"graf graf--p graf-after--p\">We may change, terminate, or restrict access to any aspect of the service, at any time, without notice.</p><h4 name=\"12f1\" id=\"12f1\" class=\"graf graf--h4 graf-after--p\"><strong class=\"markup--strong markup--h4-strong\">No children</strong></h4><p name=\"2ce7\" id=\"2ce7\" class=\"graf graf--p graf-after--h4\">Medium is only for people 13 years old and over. By using Medium, you affirm that you are over 13. If we learn someone under 13 is using Medium, we’ll terminate their account.</p><h4 name=\"531c\" id=\"531c\" class=\"graf graf--h4 graf-after--p\"><strong class=\"markup--strong markup--h4-strong\">Security</strong></h4><p name=\"3155\" id=\"3155\" class=\"graf graf--p graf-after--h4\">If you find a security vulnerability on Medium, tell us. We have a <a href=\"https://medium.com/policy/medium-s-bug-bounty-disclosure-program-34b1c80764c2\" data-href=\"https://medium.com/policy/medium-s-bug-bounty-disclosure-program-34b1c80764c2\" class=\"markup--anchor markup--p-anchor\" target=\"_blank\">bug bounty disclosure program</a>.</p><h4 name=\"05cc\" id=\"05cc\" class=\"graf graf--h4 graf-after--p\"><strong class=\"markup--strong markup--h4-strong\">Incorporated rules and\\xa0policies</strong></h4><p name=\"5207\" id=\"5207\" class=\"graf graf--p graf-after--h4\">By using the Services, you agree to let Medium collect and use information as detailed in our <a href=\"https://medium.com/p/f03bf92035c9\" data-href=\"https://medium.com/p/f03bf92035c9\" class=\"markup--anchor markup--p-anchor\" target=\"_blank\">Privacy Policy</a>. If you’re outside the United States, you consent to letting Medium transfer, store, and process your information (including your personal information and content) in and out of the United States.</p><p name=\"6230\" id=\"6230\" class=\"graf graf--p graf-after--p\">To enable a functioning community, we have <a href=\"https://medium.com/policy/medium-rules-30e5502c4eb4\" data-href=\"https://medium.com/policy/medium-rules-30e5502c4eb4\" class=\"markup--anchor markup--p-anchor\" target=\"_blank\">Rules</a>. To ensure usernames are distributed and used fairly, we have a <a href=\"https://medium.com/@Medium/medium-username-policy-7054a77fb04f\" data-href=\"https://medium.com/@Medium/medium-username-policy-7054a77fb04f\" class=\"markup--anchor markup--p-anchor\" target=\"_blank\">Username Policy</a>. Under our <a href=\"https://medium.com/policy/mediums-copyright-and-dmca-policy-d126f73695\" data-href=\"https://medium.com/policy/mediums-copyright-and-dmca-policy-d126f73695\" class=\"markup--anchor markup--p-anchor\" target=\"_blank\">DMCA Policy</a>, we’ll remove material after receiving a valid takedown notice. Under our <a href=\"https://medium.com/policy/mediums-trademark-policy-e3bb53df59a7\" data-href=\"https://medium.com/policy/mediums-trademark-policy-e3bb53df59a7\" class=\"markup--anchor markup--p-anchor\" target=\"_blank\">Trademark Policy</a>, we’ll investigate any use of another’s trademark and respond appropriately.</p><p name=\"21ad\" id=\"21ad\" class=\"graf graf--p graf-after--p\">By using Medium, you agree to follow these Rules and Policies. If you don’t, we may remove content, or suspend or delete your account.</p><h4 name=\"a2a2\" id=\"a2a2\" class=\"graf graf--h4 graf-after--p\"><strong class=\"markup--strong markup--h4-strong\">Miscellaneous</strong></h4><p name=\"b7da\" id=\"b7da\" class=\"graf graf--p graf-after--h4\"><em class=\"markup--em markup--p-em\">Disclaimer of warranty.</em> Medium provides the Services to you as is. You use them at your own risk and discretion. That means they don’t come with any warranty. None express, none implied. No implied warranty of merchantability, fitness for a particular purpose, availability, security, title or non-infringement.</p><p name=\"7073\" id=\"7073\" class=\"graf graf--p graf-after--p\"><em class=\"markup--em markup--p-em\">Limitation of Liability</em>. Medium won’t be liable to you for any damages that arise from your using the Services. This includes if the Services are hacked or unavailable. This includes all types of damages (indirect, incidental, consequential, special or exemplary). And it includes all kinds of legal claims, such as breach of contract, breach of warranty, tort, or any other loss.</p><p name=\"3d70\" id=\"3d70\" class=\"graf graf--p graf-after--p\"><em class=\"markup--em markup--p-em\">No waiver.</em> If Medium doesn’t exercise a particular right under these Terms, that doesn’t waive it.</p><p name=\"ab04\" id=\"ab04\" class=\"graf graf--p graf-after--p\"><em class=\"markup--em markup--p-em\">Severability</em>. If any provision of these terms is found invalid by a court of competent jurisdiction, you agree that the court should try to give effect to the parties’ intentions as reflected in the provision and that other provisions of the Terms will remain in full effect.</p><p name=\"bde8\" id=\"bde8\" class=\"graf graf--p graf-after--p\"><em class=\"markup--em markup--p-em\">Choice of law and jurisdiction.</em> These Terms are governed by California law, without reference to its conflict of laws provisions. You agree that any suit arising from the Services must take place in a court located in San Francisco, California.</p><p name=\"bbb3\" id=\"bbb3\" class=\"graf graf--p graf-after--p\"><em class=\"markup--em markup--p-em\">Entire agreement.</em> These Terms (including any document incorporated by reference into them) are the whole agreement between Medium and you concerning the Services.</p><p name=\"dbf1\" id=\"dbf1\" class=\"graf graf--p graf-after--p\"><em class=\"markup--em markup--p-em\">Government use.</em> If you’re \\u200busing \\u200bMedium for the U.S. Government, <a href=\"https://medium.com/@Medium/amendment-to-medium-terms-of-service-applicable-to-u-s-government-users-fccb00db67d7\" data-href=\"https://medium.com/@Medium/amendment-to-medium-terms-of-service-applicable-to-u-s-government-users-fccb00db67d7\" class=\"markup--anchor markup--p-anchor\" target=\"_blank\">this Amendment</a> to \\u200bMedium’s Terms of Service \\u200bapplies to you\\u200b.</p><p name=\"3318\" id=\"3318\" class=\"graf graf--p graf-after--p graf--trailing\">Questions? Let us know at <a href=\"mailto:%20legal@medium.com\" data-href=\"mailto:%20legal@medium.com\" class=\"markup--anchor markup--p-anchor\" target=\"_blank\">legal@medium.com</a>.</p></div></div></section></div><footer class=\"u-paddingTop10\"><div class=\"container u-maxWidth740\"><div class=\"row\"><div class=\"col u-size12of12\"></div></div><div class=\"row\"><div class=\"col u-size12of12 js-postTags\"><div class=\"u-paddingBottom10\"><ul class=\"tags tags--postTags tags--borderless\"><li><a class=\"link u-baseColor--link\" href=\"https://medium.com/tag/terms-and-conditions?source=post\" data-action-source=\"post\">Terms And Conditions</a></li><li><a class=\"link u-baseColor--link\" href=\"https://medium.com/tag/terms?source=post\" data-action-source=\"post\">Terms</a></li><li><a class=\"link u-baseColor--link\" href=\"https://medium.com/tag/medium?source=post\" data-action-source=\"post\">Medium</a></li></ul></div></div></div><section class=\"uiScale uiScale-ui--small uiScale-caption--regular u-borderTopLightest u-marginTop10 u-paddingTop20\"><div class=\"ui-h3 u-textColorDarker u-fontSize22\">One clap, two clap, three clap, forty?</div><p class=\"ui-body u-marginBottom20 u-textColorDark u-fontSize16\">By clapping more or less, you can signal to us which stories really stand out.</p></section><div class=\"postActions js-postActionsFooter\"><div class=\"u-flexCenter\"><div class=\"u-flex1\"><div class=\"multirecommend js-actionMultirecommend u-flexCenter u-width60\" data-post-id=\"9db0094a1e0f\" data-is-icon-29px=\"true\" data-is-circle=\"true\" data-has-recommend-list=\"true\" data-source=\"post_actions_footer-----9db0094a1e0f---------------------clap_footer\"><div class=\"u-relative u-foreground\"><div class=\"clapUndo u-width60 u-round u-height32 u-absolute u-borderBox u-paddingRight5 u-transition--transform200Spring u-background--brandSageLighter js-clapUndo\" style=\"top: 14px; padding: 2px;\"></div></div><span class=\"u-textAlignCenter u-relative u-background js-actionMultirecommendCount u-marginLeft10\"></span></div></div><div class=\"buttonSet u-flex0\"></div></div></div></div><div class=\"u-maxWidth740 u-paddingTop20 u-marginTop20 u-borderTopLightest container u-paddingBottom20 u-xs-paddingBottom10 js-postAttributionFooterContainer\"><div class=\"row js-postFooterInfo\"><div class=\"col u-size6of12 u-xs-size12of12\"><li class=\"uiScale uiScale-ui--small uiScale-caption--regular u-block u-paddingBottom18 js-cardUser\"><div class=\"u-marginLeft20 u-floatRight\"><span class=\"followState js-followState\" data-user-id=\"504c7870fdb6\"></span></div><div class=\"u-tableCell\"><a class=\"link u-baseColor--link avatar\" href=\"https://medium.com/@Medium?source=footer_card\" title=\"Go to the profile of Medium\" aria-label=\"Go to the profile of Medium\" data-action-source=\"footer_card\" data-user-id=\"504c7870fdb6\" dir=\"auto\"><div class=\"u-relative u-inlineBlock u-flex0\"><img src=\"https://cdn-images-1.medium.com/fit/c/120/120/1*6_fgYnisCa9V21mymySIvA.png\" class=\"avatar-image avatar-image--small\" alt=\"Go to the profile of Medium\"><div class=\"avatar-halo u-absolute u-textColorGreenNormal svgIcon\" style=\"width: calc(100% + 12px); height: calc(100% + 12px); top:-6px; left:-6px\"><svg viewbox=\"0 0 114 114\" xmlns=\"http://www.w3.org/2000/svg\"><path d=\"M7.66922967,32.092726 C17.0070768,13.6353618 35.9421928,1.75 57,1.75 C78.0578072,1.75 96.9929232,13.6353618 106.33077,32.092726 L107.66923,31.4155801 C98.0784505,12.4582656 78.6289015,0.25 57,0.25 C35.3710985,0.25 15.9215495,12.4582656 6.33077033,31.4155801 L7.66922967,32.092726 Z\"></path><path d=\"M106.33077,81.661427 C96.9929232,100.118791 78.0578072,112.004153 57,112.004153 C35.9421928,112.004153 17.0070768,100.118791 7.66922967,81.661427 L6.33077033,82.338573 C15.9215495,101.295887 35.3710985,113.504153 57,113.504153 C78.6289015,113.504153 98.0784505,101.295887 107.66923,82.338573 L106.33077,81.661427 Z\"></path></svg></div></div></a></div><div class=\"u-tableCell u-verticalAlignMiddle u-breakWord u-paddingLeft15\"><h3 class=\"ui-h3 u-fontSize18 u-lineHeightTighter\"><a class=\"link link--primary u-accentColor--hoverTextNormal\" href=\"https://medium.com/@Medium\" property=\"cc:attributionName\" title=\"Go to the profile of Medium\" aria-label=\"Go to the profile of Medium\" rel=\"author cc:attributionUrl\" data-user-id=\"504c7870fdb6\" dir=\"auto\">Medium</a></h3><div class=\"ui-caption u-textColorGreenNormal u-fontSize13 u-tintSpectrum u-accentColor--textNormal u-marginBottom7\">Medium member since Aug 2017</div><p class=\"ui-body u-fontSize14 u-lineHeightBaseSans u-textColorDark u-marginBottom4\">Everyone’s stories and ideas</p></div></li></div><div class=\"col u-size6of12 u-xs-size12of12 u-xs-marginTop30\"><li class=\"uiScale uiScale-ui--small uiScale-caption--regular u-block u-paddingBottom18 js-cardCollection\"><div class=\"u-marginLeft20 u-floatRight\"></div><div class=\"u-tableCell \"><a class=\"link u-baseColor--link avatar avatar--roundedRectangle\" href=\"https://medium.com/policy?source=footer_card\" title=\"Go to Medium Policy\" aria-label=\"Go to Medium Policy\" data-action-source=\"footer_card\"><img src=\"https://cdn-images-1.medium.com/fit/c/120/120/1*6_fgYnisCa9V21mymySIvA.png\" class=\"avatar-image u-size60x60\" alt=\"Medium Policy\"></a></div><div class=\"u-tableCell u-verticalAlignMiddle u-breakWord u-paddingLeft15\"><h3 class=\"ui-h3 u-fontSize18 u-lineHeightTighter u-marginBottom4\"><a class=\"link link--primary u-accentColor--hoverTextNormal\" href=\"https://medium.com/policy?source=footer_card\" rel=\"collection\" data-action-source=\"footer_card\">Medium Policy</a></h3><p class=\"ui-body u-fontSize14 u-lineHeightBaseSans u-textColorDark u-marginBottom4\">The Fine Print</p><div class=\"buttonSet\"></div></div></li></div></div></div><div class=\"js-postFooterPlacements\"></div><div class=\"u-padding0 u-clearfix u-backgroundGrayLightest u-print-hide supplementalPostContent js-responsesWrapper\"></div><div class=\"supplementalPostContent js-heroPromo\"></div></footer></div>',\n",
" 'domain': 'medium.com',\n",
" 'image_url': None,\n",
" 'link_tags': {'alternate': 'android-app://com.medium.reader/https/medium.com/p/9db0094a1e0f',\n",
" 'apple-touch-icon': 'https://cdn-images-1.medium.com/fit/c/120/120/1*6_fgYnisCa9V21mymySIvA.png',\n",
" 'author': 'https://medium.com/@Medium',\n",
" 'canonical': 'https://medium.com/policy/medium-terms-of-service-9db0094a1e0f',\n",
" 'icon': 'https://cdn-static-1.medium.com/_/fp/icons/favicon-rebrand-medium.3Y6xpZ-0FSdWDnPM3hSBIA.ico',\n",
" 'mask-icon': 'https://cdn-static-1.medium.com/_/fp/icons/monogram-mask.KPLCSFEZviQN0jQ7veN2RQ.svg',\n",
" 'publisher': 'https://plus.google.com/103654360130207659246',\n",
" 'search': '/osd.xml',\n",
" 'stylesheet': 'https://cdn-static-1.medium.com/_/fp/css/main-branding-base.Ch8g7KPCoGXbtKfJaVXo_w.css'},\n",
" 'meta_tags': {'al:android:app_name': 'Medium',\n",
" 'al:android:package': 'com.medium.reader',\n",
" 'al:android:url': 'medium://p/9db0094a1e0f',\n",
" 'al:ios:app_name': 'Medium',\n",
" 'al:ios:app_store_id': '828256236',\n",
" 'al:ios:url': 'medium://p/9db0094a1e0f',\n",
" 'al:web:url': 'https://medium.com/policy/medium-terms-of-service-9db0094a1e0f',\n",
" 'article:author': 'https://medium.com/@Medium',\n",
" 'article:published_time': '2012-08-13T22:54:53.510Z',\n",
" 'article:publisher': 'https://www.facebook.com/medium',\n",
" 'author': 'Medium',\n",
" 'description': 'These Terms of Service (“Terms”) are a contract between you and A Medium Corporation. They govern your use of Medium’s sites, services, mobile apps, products, and content (“Services”). By using…',\n",
" 'fb:app_id': '542599432471018',\n",
" 'og:description': 'These Terms of Service (“Terms”) are a contract between you and A Medium Corporation. They govern your use of Medium’s sites, services, mobile apps, products, and content (“Services”). By using…',\n",
" 'og:site_name': 'Medium',\n",
" 'og:title': 'Medium Terms of Service – Medium Policy – Medium',\n",
" 'og:type': 'article',\n",
" 'og:url': 'https://medium.com/policy/medium-terms-of-service-9db0094a1e0f',\n",
" 'referrer': 'unsafe-url',\n",
" 'robots': 'index, follow',\n",
" 'theme-color': '#000000',\n",
" 'title': 'Medium Terms of Service – Medium Policy – Medium',\n",
" 'twitter:app:id:iphone': '828256236',\n",
" 'twitter:app:name:iphone': 'Medium',\n",
" 'twitter:app:url:iphone': 'medium://p/9db0094a1e0f',\n",
" 'twitter:card': 'summary',\n",
" 'twitter:creator': '@Medium',\n",
" 'twitter:data1': '5 min read',\n",
" 'twitter:description': 'These Terms of Service (“Terms”) are a contract between you and A Medium Corporation. They govern your use of Medium’s sites, services, mobile apps, products, and content (“Services”). By using…',\n",
" 'twitter:label1': 'Reading time',\n",
" 'twitter:site': '@Medium',\n",
" 'viewport': 'width=device-width, initial-scale=1'},\n",
" 'published': {'$date': '2012-08-13T22:54:53.510Z'},\n",
" 'tags': [],\n",
" 'title': 'Medium Terms of Service – Medium Policy – Medium',\n",
" 'url': 'https://medium.com/policy/medium-terms-of-service-9db0094a1e0f'}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jsonofabitch"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2018-03-06T11:57:26.686762Z",
"start_time": "2018-03-06T11:57:26.678762Z"
},
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['_id', '_timestamp', '_spider', 'url', 'domain', 'published', 'title', 'content', 'author', 'image_url', 'tags', 'link_tags', 'meta_tags'])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jsonofabitch.keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"f = csv.writer(open(\"test.csv\", \"wb+\"))\n",
"\n",
"# Write CSV Header, If you dont need that, remove this line\n",
"f.writerow(['_id', '_timestamp', '_spider', 'url', 'domain', 'published', 'title',\\\n",
" 'content', 'author', 'image_url', 'tags', 'link_tags', 'meta_tags'])\n",
"\n",
"#some have second level also like meta tags\n",
"for x in x:\n",
" f.writerow([x[\"_id\"],\n",
" x[\"_timestamp\"],\n",
" x['_spider'],\n",
" x['_author']\n",
" ])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment