Skip to content

Instantly share code, notes, and snippets.

@reflash
Created February 14, 2019 08:07
Show Gist options
  • Save reflash/f3c2f8933762a454ec25a722777d1fb0 to your computer and use it in GitHub Desktop.
Save reflash/f3c2f8933762a454ec25a722777d1fb0 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: gensim in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (3.7.1)\n",
"Requirement already satisfied: scipy>=0.18.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.2.0)\n",
"Requirement already satisfied: smart-open>=1.7.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.8.0)\n",
"Requirement already satisfied: numpy>=1.11.3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.16.0)\n",
"Requirement already satisfied: six>=1.5.0 in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from gensim) (1.12.0)\n",
"Requirement already satisfied: boto>=2.32 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.49.0)\n",
"Requirement already satisfied: bz2file in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (0.98)\n",
"Requirement already satisfied: requests in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.21.0)\n",
"Requirement already satisfied: boto3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (1.9.90)\n",
"Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2.8)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2018.11.29)\n",
"Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (3.0.4)\n",
"Requirement already satisfied: urllib3<1.25,>=1.21.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (1.24.1)\n",
"Requirement already satisfied: botocore<1.13.0,>=1.12.90 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (1.12.90)\n",
"Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.9.3)\n",
"Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.2.0)\n",
"Requirement already satisfied: docutils>=0.10 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (0.14)\n",
"Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\" in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (2.7.5)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nltk in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (3.4)\n",
"Requirement already satisfied: six in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from nltk) (1.12.0)\n",
"Requirement already satisfied: singledispatch in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from nltk) (3.4.0.3)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n",
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\Daniil_Ekzarian\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Unzipping tokenizers\\punkt.zip.\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import sys\n",
"\n",
"# Install into the interpreter that backs this kernel (not whatever `pip` is on PATH).\n",
"# Versions are pinned to the ones this notebook was last run with: the phrase-detection\n",
"# cell passes a bytes delimiter (b' '), which is the gensim 3.x API.\n",
"!{sys.executable} -m pip install gensim==3.7.1 nltk==3.4\n",
"\n",
"import nltk\n",
"\n",
"# Punkt is the sentence-boundary model that nltk.sent_tokenize relies on.\n",
"nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from gensim.models.phrases import Phrases, Phraser\n",
"from nltk import sent_tokenize\n",
"\n",
"# Sample passage for the phrase-detection demo. The literal is stripped because\n",
"# the triple-quoted block starts and ends with a newline, and the leading one\n",
"# otherwise sticks to the first sentence ('\\nSleep is important, ...').\n",
"text = \"\"\"\n",
"Sleep is important, you know that. Skimping on sleep means you’re courting all kinds of health problems down the line, from heart disease to obesity to diabetes. Still, that doesn’t stop many of us from doing it: More than a third of adults get fewer than seven hours of sleep a night, according to the CDC.\n",
"But what about the opposite side of the equation? Experts typically recommend that adults get seven to nine hours of sleep each night—so is it possible to sleep too much? Less is known on this topic, but it turns out that oversleeping also isn’t good.\n",
"Studies have linked oversleeping with myriad health problems, including a higher risk of death from any cause. In one large meta-analysis published last year in the Journal of the American Heart Association, the risk was shown to increase with each extra hour of sleep: Sleeping nine hours was associated with a 14% increased risk of dying, 10 hours with a 30% higher risk, and 11 hours with a 47% higher risk. The risk of dying from heart disease and stroke also increased with longer sleep times.\n",
"\"\"\".strip()\n",
"\n",
"# Split the passage into sentences with NLTK's Punkt tokenizer.\n",
"sentences = sent_tokenize(text)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['\\nSleep is important, you know that.',\n",
" 'Skimping on sleep means you’re courting all kinds of health problems down the line, from heart disease to obesity to diabetes.',\n",
" 'Still, that doesn’t stop many of us from doing it: More than a third of adults get fewer than seven hours of sleep a night, according to the CDC.',\n",
" 'But what about the opposite side of the equation?',\n",
" 'Experts typically recommend that adults get seven to nine hours of sleep each night—so is it possible to sleep too much?',\n",
" 'Less is known on this topic, but it turns out that oversleeping also isn’t good.',\n",
" 'Studies have linked oversleeping with myriad health problems, including a higher risk of death from any cause.',\n",
" 'In one large meta-analysis published last year in the Journal of the American Heart Association, the risk was shown to increase with each extra hour of sleep: Sleeping nine hours was associated with a 14% increased risk of dying, 10 hours with a 30% higher risk, and 11 hours with a 47% higher risk.',\n",
" 'The risk of dying from heart disease and stroke also increased with longer sleep times.']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sentences"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['\\nSleep',\n",
" 'is',\n",
" 'important,',\n",
" 'you',\n",
" 'know',\n",
" 'that.',\n",
" 'Skimping',\n",
" 'on',\n",
" 'sleep',\n",
" 'means',\n",
" 'you’re',\n",
" 'courting',\n",
" 'all',\n",
" 'kinds',\n",
" 'of',\n",
" 'health',\n",
" 'problems',\n",
" 'down',\n",
" 'the',\n",
" 'line,',\n",
" 'from',\n",
" 'heart',\n",
" 'disease',\n",
" 'to',\n",
" 'obesity',\n",
" 'to',\n",
" 'diabetes.',\n",
" 'Still,',\n",
" 'that',\n",
" 'doesn’t',\n",
" 'stop',\n",
" 'many',\n",
" 'of',\n",
" 'us',\n",
" 'from',\n",
" 'doing',\n",
" 'it:',\n",
" 'More',\n",
" 'than',\n",
" 'a',\n",
" 'third',\n",
" 'of',\n",
" 'adults',\n",
" 'get',\n",
" 'fewer',\n",
" 'than',\n",
" 'seven',\n",
" 'hours',\n",
" 'of',\n",
" 'sleep',\n",
" 'a',\n",
" 'night,',\n",
" 'according',\n",
" 'to',\n",
" 'the',\n",
" 'CDC.',\n",
" 'But',\n",
" 'what',\n",
" 'about',\n",
" 'the',\n",
" 'opposite',\n",
" 'side',\n",
" 'of',\n",
" 'the',\n",
" 'equation?',\n",
" 'Experts',\n",
" 'typically',\n",
" 'recommend',\n",
" 'that',\n",
" 'adults',\n",
" 'get',\n",
" 'seven',\n",
" 'to',\n",
" 'nine',\n",
" 'hours',\n",
" 'of',\n",
" 'sleep',\n",
" 'each',\n",
" 'night—so',\n",
" 'is',\n",
" 'it',\n",
" 'possible',\n",
" 'to',\n",
" 'sleep',\n",
" 'too',\n",
" 'much?',\n",
" 'Less',\n",
" 'is',\n",
" 'known',\n",
" 'on',\n",
" 'this',\n",
" 'topic,',\n",
" 'but',\n",
" 'it',\n",
" 'turns',\n",
" 'out',\n",
" 'that',\n",
" 'oversleeping',\n",
" 'also',\n",
" 'isn’t',\n",
" 'good.',\n",
" 'Studies',\n",
" 'have',\n",
" 'linked',\n",
" 'oversleeping',\n",
" 'with',\n",
" 'myriad',\n",
" 'health',\n",
" 'problems,',\n",
" 'including',\n",
" 'a',\n",
" 'higher',\n",
" 'risk of',\n",
" 'death',\n",
" 'from',\n",
" 'any',\n",
" 'cause.',\n",
" 'In',\n",
" 'one',\n",
" 'large',\n",
" 'meta-analysis',\n",
" 'published',\n",
" 'last',\n",
" 'year',\n",
" 'in',\n",
" 'the',\n",
" 'Journal',\n",
" 'of',\n",
" 'the',\n",
" 'American',\n",
" 'Heart',\n",
" 'Association,',\n",
" 'the',\n",
" 'risk',\n",
" 'was',\n",
" 'shown',\n",
" 'to',\n",
" 'increase',\n",
" 'with',\n",
" 'each',\n",
" 'extra',\n",
" 'hour',\n",
" 'of',\n",
" 'sleep:',\n",
" 'Sleeping',\n",
" 'nine',\n",
" 'hours',\n",
" 'was',\n",
" 'associated',\n",
" 'with a',\n",
" '14%',\n",
" 'increased',\n",
" 'risk of',\n",
" 'dying,',\n",
" '10',\n",
" 'hours',\n",
" 'with a',\n",
" '30%',\n",
" 'higher',\n",
" 'risk,',\n",
" 'and',\n",
" '11',\n",
" 'hours',\n",
" 'with a',\n",
" '47%',\n",
" 'higher',\n",
" 'risk.',\n",
" 'The',\n",
" 'risk of',\n",
" 'dying',\n",
" 'from',\n",
" 'heart',\n",
" 'disease',\n",
" 'and',\n",
" 'stroke',\n",
" 'also',\n",
" 'increased',\n",
" 'with',\n",
" 'longer',\n",
" 'sleep',\n",
" 'times.']"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Whitespace-tokenise each sentence. A bare split() is used instead of\n",
"# split(\" \") so newlines and repeated spaces never end up inside a token\n",
"# (e.g. '\\nSleep') or yield empty-string tokens.\n",
"sentence_stream = [sentence.split() for sentence in sentences]\n",
"\n",
"# Pass 1: merge frequently co-occurring word pairs into single bigram tokens.\n",
"bigram = Phrases(sentence_stream, min_count=2, threshold=2, delimiter=b' ')\n",
"bigram_phraser = Phraser(bigram)\n",
"bigram_sentences = bigram_phraser[sentence_stream]\n",
"\n",
"# Pass 2: run the same detection over the bigram output so a bigram can merge\n",
"# with a neighbouring token into a longer phrase.\n",
"trigram = Phrases(bigram_sentences, min_count=2, threshold=2, delimiter=b' ')\n",
"trigram_phraser = Phraser(trigram)\n",
"trigram_sentences = trigram_phraser[bigram_sentences]\n",
"\n",
"# Flatten the per-sentence token lists into one list for display.\n",
"all_words = [token for sentence in trigram_sentences for token in sentence]\n",
"all_words"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment