Skip to content

Instantly share code, notes, and snippets.

@davidmcclure
Created July 13, 2018 13:51
Show Gist options
  • Save davidmcclure/096111edc00090bbda307e9e5fc3bb8a to your computer and use it in GitHub Desktop.
Save davidmcclure/096111edc00090bbda307e9e5fc3bb8a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import ujson\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from scipy.spatial import distance\n",
"from tqdm import tqdm\n",
"\n",
"from geovec.dce_db import Token, connect_db"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"mpl.style.use('seaborn-muted')\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"engine, session = connect_db('/data/6m-2k.db')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def random_split_ds(embeds1, embeds2, n=100):\n",
"    \"\"\"Cosine distances between group means for random re-splits of two groups.\n",
"\n",
"    The rows of ``embeds1`` and ``embeds2`` are pooled along axis 0. For each\n",
"    of the ``n`` draws the pooled row indices are shuffled and cut into two\n",
"    groups with the original sizes, and the cosine distance between the two\n",
"    group means is recorded.\n",
"\n",
"    Args:\n",
"        embeds1: Array-like of vectors, one per row.\n",
"        embeds2: Array-like of vectors, one per row.\n",
"        n: Number of random splits to draw.\n",
"\n",
"    Returns:\n",
"        list: ``n`` cosine distances, one per random split.\n",
"    \"\"\"\n",
"    pooled = np.append(embeds1, embeds2, 0)\n",
"\n",
"    def shuffled_split_distance():\n",
"        # A fresh index list is shuffled for every draw.\n",
"        order = list(range(len(embeds1)+len(embeds2)))\n",
"        random.shuffle(order)\n",
"        first = pooled[order[:len(embeds1)]]\n",
"        second = pooled[order[len(embeds1):]]\n",
"        return distance.cosine(first.mean(0), second.mean(0))\n",
"\n",
"    return [shuffled_split_distance() for _ in range(n)]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Read explicitly as UTF-8: the token list contains non-ASCII entries\n",
"# (curly quotes, emoji), so the locale default encoding is not safe here.\n",
"with open('/home/ubuntu/checkpoints/dce/top1k.json', encoding='utf-8') as fh:\n",
"    counts = ujson.load(fh)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def sample_token(token):\n",
"    \"\"\"Compare the LA and SF embeddings of ``token`` against random re-splits.\n",
"\n",
"    Up to 1000 ``Token`` rows are read for each of the 'LA' and 'SF' groups\n",
"    and each row's ``embedding`` buffer is decoded as a float32 vector.\n",
"\n",
"    Args:\n",
"        token: Value matched against ``Token.token``.\n",
"\n",
"    Returns:\n",
"        tuple: ``(d, ds)`` where ``d`` is the cosine distance between the SF\n",
"        and LA mean embeddings and ``ds`` is the list of distances returned\n",
"        by ``random_split_ds`` for the same two sets of embeddings.\n",
"    \"\"\"\n",
"    def load_embeds(group):\n",
"        rows = (\n",
"            session.query(Token)\n",
"            .filter(Token.token==token)\n",
"            .filter(Token.group==group)\n",
"            .limit(1000)\n",
"        )\n",
"        # np.stack needs a real sequence: generator input is deprecated\n",
"        # since NumPy 1.16 and rejected by newer releases.\n",
"        return np.stack([np.frombuffer(r.embedding, dtype='float32') for r in rows])\n",
"\n",
"    la_embeds = load_embeds('LA')\n",
"    sf_embeds = load_embeds('SF')\n",
"\n",
"    d = distance.cosine(sf_embeds.mean(0), la_embeds.mean(0))\n",
"\n",
"    ds = random_split_ds(la_embeds, sf_embeds)\n",
"\n",
"    return d, ds"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"session.rollback()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 .\n",
"1 the\n",
"2 i\n",
"3 !\n",
"4 to\n",
"5 ,\n",
"6 a\n",
"7 you\n",
"8 and\n",
"9 of\n",
"10 is\n",
"11 in\n",
"12 for\n",
"13 ?\n",
"14 it\n",
"15 my\n",
"16 this\n",
"17 on\n",
"18 ’\n",
"19 's\n",
"20 that\n",
"21 :\n",
"22 ...\n",
"23 me\n",
"24 with\n",
"25 your\n",
"26 at\n",
"27 n't\n",
"28 so\n",
"29 be\n",
"30 do\n",
"31 are\n",
"32 we\n",
"33 have\n",
"34 ;\n",
"35 just\n",
"36 ``\n",
"37 but\n",
"38 &\n",
"39 all\n",
"40 like\n",
"41 ''\n",
"42 was\n",
"43 not\n",
"44 -\n",
"45 what\n",
"46 out\n",
"47 )\n",
"48 can\n",
"49 from\n",
"50 'm\n",
"51 amp\n",
"52 up\n",
"53 (\n",
"54 if\n",
"55 get\n",
"56 how\n",
"57 s\n",
"58 about\n",
"59 when\n",
"60 love\n",
"61 by\n",
"62 no\n",
"63 one\n",
"64 new\n",
"65 they\n",
"66 as\n",
"67 he\n",
"68 more\n",
"69 our\n",
"70 now\n",
"71 an\n",
"72 will\n",
"73 time\n",
"74 day\n",
"75 t\n",
"76 know\n",
"77 who\n",
"78 good\n",
"79 or\n",
"80 people\n",
"81 got\n",
"82 today\n",
"83 see\n",
"84 there\n",
"85 go\n",
"86 via\n",
"87 here\n",
"88 're\n",
"89 de\n",
"90 some\n",
"91 why\n",
"92 has\n",
"93 video\n",
"94 na\n",
"95 need\n",
"96 want\n",
"97 la\n",
"98 did\n",
"99 would\n",
"100 been\n",
"101 @\n",
"102 back\n",
"103 us\n",
"104 great\n",
"105 “\n",
"106 make\n",
"107 really\n",
"108 his\n",
"109 ca\n",
"110 think\n",
"111 she\n",
"112 m\n",
"113 que\n",
"114 too\n",
"115 happy\n",
"116 going\n",
"117 best\n",
"118 u\n",
"119 much\n",
"120 $\n",
"121 only\n",
"122 her\n",
"123 right\n",
"124 ”\n",
"125 life\n",
"126 thank\n",
"127 still\n",
"128 had\n",
"129 lol\n",
"130 work\n",
"131 let\n",
"132 thanks\n",
"133 first\n",
"134 should\n",
"135 never\n",
"136 these\n",
"137 them\n",
"138 their\n",
"139 …\n",
"140 am\n",
"141 last\n",
"142 've\n",
"143 2\n",
"144 way\n",
"145 year\n",
"146 over\n",
"147 than\n",
"148 come\n",
"149 off\n",
"150 even\n",
"151 take\n",
"152 always\n",
"153 does\n",
"154 '\n",
"155 because\n",
"156 check\n",
"157 look\n",
"158 show\n",
"159 please\n",
"160 night\n",
"161 shit\n",
"162 could\n",
"163 'll\n",
"164 ever\n",
"165 say\n",
"166 being\n",
"167 trump\n",
"168 man\n",
"169 feel\n",
"170 where\n",
"171 after\n",
"172 into\n",
"173 next\n",
"174 3\n",
"175 then\n",
"176 him\n",
"177 live\n",
"178 better\n",
"179 y\n",
"180 week\n",
"181 someone\n",
"182 every\n",
"183 down\n",
"184 were\n",
"185 don\n",
"186 😂\n",
"187 world\n",
"188 help\n",
"189 tonight\n",
"190 well\n",
"191 most\n",
"192 made\n",
"193 watch\n",
"194 everyone\n",
"195 getting\n",
"196 any\n",
"197 thing\n",
"198 |\n",
"199 game\n",
"200 also\n",
"201 real\n",
"202 fuck\n",
"203 los\n",
"204 gon\n",
"205 hope\n",
"206 very\n",
"207 things\n",
"208 wait\n",
"209 keep\n",
"210 again\n",
"211 home\n",
"212 oh\n",
"213 stop\n",
"214 re\n",
"215 little\n",
"216 amazing\n",
"217 other\n",
"218 something\n",
"219 music\n",
"220 free\n",
"221 friends\n",
"222 many\n",
"223 big\n",
"224 1\n",
"225 yes\n",
"226 years\n",
"227 before\n",
"228 guys\n",
"229 its\n",
"230 same\n",
"231 en\n",
"232 im\n",
"233 looking\n",
"234 give\n",
"235 sure\n",
"236 bad\n",
"237 %\n",
"238 girl\n",
"239 find\n",
"240 those\n",
"241 doing\n",
"242 2017\n",
"243 said\n",
"244 5\n",
"245 two\n",
"246 el\n",
"247 days\n",
"248 tell\n",
"249 hey\n",
"250 may\n",
"251 another\n",
"252 fun\n",
"253 miss\n",
"254 use\n",
"255 4\n",
"256 coming\n",
"257 favorite\n",
"258 long\n",
"259 beautiful\n",
"260 gt\n",
"261 liked\n",
"262 while\n",
"263 o\n",
"264 read\n",
"265 birthday\n",
"266 start\n",
"267 god\n",
"268 fucking\n",
"269 tomorrow\n",
"270 twitter\n",
"271 play\n",
"272 ya\n",
"273 already\n",
"274 old\n",
"275 trying\n",
"276 team\n",
"277 hard\n",
"278 ass\n",
"279 follow\n",
"280 wan\n",
"281 everything\n",
"282 ready\n",
"283 through\n",
"284 talk\n",
"285 hate\n",
"286 anyone\n",
"287 added\n",
"288 thought\n",
"289 --\n",
"290 weekend\n",
"291 women\n",
"292 part\n",
"293 making\n",
"294 10\n",
"295 watching\n",
"296 friend\n",
"297 morning\n",
"298 e\n",
"299 makes\n",
"300 call\n",
"301 own\n",
"302 excited\n",
"303 actually\n",
"304 around\n",
"305 ve\n",
"306 such\n",
"307 school\n",
"308 put\n",
"309 song\n",
"310 having\n",
"311 win\n",
"312 lot\n",
"313 try\n",
"314 news\n",
"315 white\n",
"316 money\n",
"317 house\n",
"318 done\n",
"319 baby\n",
"320 nothing\n",
"321 join\n",
"322 top\n",
"323 california\n",
"324 support\n",
"325 'd\n",
"326 black\n",
"327 guy\n",
"328 angeles\n",
"329 pretty\n",
"330 un\n",
"331 looks\n",
"332 person\n",
"333 wow\n",
"334 family\n",
"335 nice\n",
"336 which\n",
"337 since\n",
"338 soon\n",
"339 away\n",
"340 lmao\n",
"341 playlist\n",
"342 san\n",
"343 se\n",
"344 yeah\n",
"345 myself\n",
"346 damn\n",
"347 wish\n",
"348 —\n",
"349 working\n",
"350 season\n",
"351 mean\n",
"352 ta\n",
"353 believe\n",
"354 yo\n",
"355 cool\n",
"356 end\n",
"357 awesome\n",
"358 omg\n",
"359 finally\n",
"360 post\n",
"361 stay\n",
"362 yet\n",
"363 full\n",
"364 movie\n",
"365 hear\n",
"366 ..\n",
"367 job\n",
"368 care\n",
"369 heart\n",
"370 sorry\n",
"371 times\n",
"372 might\n",
"373 story\n",
"374 true\n",
"375 listen\n",
"376 mom\n",
"377 tweet\n",
"378 change\n",
"379 maybe\n",
"380 anything\n",
"381 te\n",
"382 +\n",
"383 d\n",
"384 name\n",
"385 place\n",
"386 w/\n",
"387 without\n",
"388 ❤️\n",
"389 yourself\n",
"390 high\n",
"391 bitch\n",
"392 2018\n",
"393 until\n",
"394 left\n",
"395 few\n",
"396 party\n",
"397 remember\n",
"398 learn\n",
"399 both\n",
"400 ur\n",
"401 mi\n",
"402 [\n",
"403 eu\n",
"404 seen\n",
"405 es\n",
"406 whole\n",
"407 city\n",
"408 playing\n",
"409 super\n",
"410 face\n",
"411 says\n",
"412 book\n",
"413 must\n",
"414 business\n",
"415 saw\n",
"416 buy\n",
"417 literally\n",
"418 enough\n",
"419 sad\n",
"420 open\n",
"421 talking\n",
"422 por\n",
"423 w\n",
"424 /\n",
"425 food\n",
"426 sleep\n",
"427 feeling\n",
"428 set\n",
"429 wrong\n",
"430 episode\n",
"431 went\n",
"432 proud\n",
"433 friday\n",
"434 ok\n",
"435 though\n",
"436 ll\n",
"437 ai\n",
"438 president\n",
"439 n\n",
"440 con\n",
"441 phone\n",
"442 photo\n",
"443 ]\n",
"444 taking\n",
"445 hot\n",
"446 vote\n",
"447 hi\n",
"448 para\n",
"449 kids\n",
"450 found\n",
"451 cause\n",
"452 needs\n",
"453 mind\n",
"454 used\n",
"455 cute\n",
"456 social\n",
"457 hit\n",
"458 point\n",
"459 class\n",
"460 ‘\n",
"461 lo\n",
"462 lt\n",
"463 crazy\n",
"464 hours\n",
"465 rt\n",
"466 future\n",
"467 using\n",
"468 else\n",
"469 ask\n",
"470 haha\n",
"471 together\n",
"472 meet\n"
]
}
],
"source": [
"# Score the first 500 (token, count) entries of `counts`: for each token,\n",
"# measure how far the observed LA/SF distance sits from the mean of the\n",
"# shuffled-split distances, in units of their standard deviation.\n",
"columns = ('token', 'count', 'd_exp', 'ds', 'z')\n",
"\n",
"rows = []\n",
"for rank, (tok, cnt) in enumerate(counts[:500]):\n",
"    d_exp, split_ds = sample_token(tok)\n",
"    split_mean = np.mean(split_ds)\n",
"    split_std = np.std(split_ds)\n",
"    z = abs(d_exp - split_mean) / split_std\n",
"    rows.append((tok, cnt, d_exp, split_ds, z))\n",
"    # Progress trace: position in the list and the token just processed.\n",
"    print(rank, tok)\n",
"\n",
"df = pd.DataFrame(rows, columns=columns)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>token</th>\n",
" <th>count</th>\n",
" <th>d_exp</th>\n",
" <th>ds</th>\n",
" <th>z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>203</th>\n",
" <td>los</td>\n",
" <td>7746</td>\n",
" <td>0.112331</td>\n",
" <td>[0.00135790740553, 0.000963894324147, 0.001269...</td>\n",
" <td>232.762017</td>\n",
" </tr>\n",
" <tr>\n",
" <th>437</th>\n",
" <td>ai</td>\n",
" <td>3352</td>\n",
" <td>0.058637</td>\n",
" <td>[0.00188516379617, 0.00139467376053, 0.0015749...</td>\n",
" <td>161.517873</td>\n",
" </tr>\n",
" <tr>\n",
" <th>348</th>\n",
" <td>—</td>\n",
" <td>4329</td>\n",
" <td>0.070315</td>\n",
" <td>[0.0051891477794, 0.00339512870429, 0.00291878...</td>\n",
" <td>91.550501</td>\n",
" </tr>\n",
" <tr>\n",
" <th>284</th>\n",
" <td>talk</td>\n",
" <td>5340</td>\n",
" <td>0.019648</td>\n",
" <td>[0.0011781055589, 0.00149699740841, 0.00138911...</td>\n",
" <td>90.328862</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>la</td>\n",
" <td>17385</td>\n",
" <td>0.032928</td>\n",
" <td>[0.000923795118494, 0.000985264309624, 0.00162...</td>\n",
" <td>85.569749</td>\n",
" </tr>\n",
" <tr>\n",
" <th>323</th>\n",
" <td>california</td>\n",
" <td>4747</td>\n",
" <td>0.027580</td>\n",
" <td>[0.00178457186927, 0.0017313196533, 0.00180051...</td>\n",
" <td>81.001580</td>\n",
" </tr>\n",
" <tr>\n",
" <th>229</th>\n",
" <td>its</td>\n",
" <td>6940</td>\n",
" <td>0.023652</td>\n",
" <td>[0.00159505279716, 0.00141928954964, 0.0013946...</td>\n",
" <td>77.599160</td>\n",
" </tr>\n",
" <tr>\n",
" <th>289</th>\n",
" <td>--</td>\n",
" <td>5152</td>\n",
" <td>0.040916</td>\n",
" <td>[0.00343410841811, 0.00226424170883, 0.0017492...</td>\n",
" <td>75.712834</td>\n",
" </tr>\n",
" <tr>\n",
" <th>328</th>\n",
" <td>angeles</td>\n",
" <td>4662</td>\n",
" <td>0.021476</td>\n",
" <td>[0.00119217160129, 0.00188189526946, 0.0011119...</td>\n",
" <td>66.917553</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432</th>\n",
" <td>proud</td>\n",
" <td>3388</td>\n",
" <td>0.013832</td>\n",
" <td>[0.00122885878954, 0.00113846442952, 0.0015184...</td>\n",
" <td>62.332236</td>\n",
" </tr>\n",
" <tr>\n",
" <th>120</th>\n",
" <td>$</td>\n",
" <td>14568</td>\n",
" <td>0.012642</td>\n",
" <td>[0.000890424457262, 0.000949469690017, 0.00106...</td>\n",
" <td>60.034818</td>\n",
" </tr>\n",
" <tr>\n",
" <th>466</th>\n",
" <td>future</td>\n",
" <td>3147</td>\n",
" <td>0.026197</td>\n",
" <td>[0.00282667512576, 0.00348593916129, 0.0032119...</td>\n",
" <td>53.824493</td>\n",
" </tr>\n",
" <tr>\n",
" <th>305</th>\n",
" <td>ve</td>\n",
" <td>4936</td>\n",
" <td>0.024768</td>\n",
" <td>[0.000858097256798, 0.00147942598951, 0.001316...</td>\n",
" <td>51.242633</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>via</td>\n",
" <td>20589</td>\n",
" <td>0.013783</td>\n",
" <td>[0.00112213406955, 0.00127279928217, 0.0012672...</td>\n",
" <td>48.546740</td>\n",
" </tr>\n",
" <tr>\n",
" <th>302</th>\n",
" <td>excited</td>\n",
" <td>4982</td>\n",
" <td>0.009982</td>\n",
" <td>[0.000896301546452, 0.000960477868357, 0.00116...</td>\n",
" <td>47.222492</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68</th>\n",
" <td>more</td>\n",
" <td>27929</td>\n",
" <td>0.016556</td>\n",
" <td>[0.00221123896012, 0.00173908846677, 0.0017509...</td>\n",
" <td>46.883646</td>\n",
" </tr>\n",
" <tr>\n",
" <th>382</th>\n",
" <td>+</td>\n",
" <td>3869</td>\n",
" <td>0.023103</td>\n",
" <td>[0.00391646925771, 0.00408116596851, 0.0032337...</td>\n",
" <td>46.831773</td>\n",
" </tr>\n",
" <tr>\n",
" <th>368</th>\n",
" <td>care</td>\n",
" <td>3978</td>\n",
" <td>0.024201</td>\n",
" <td>[0.00230850653227, 0.00216674519179, 0.0018881...</td>\n",
" <td>46.454507</td>\n",
" </tr>\n",
" <tr>\n",
" <th>479</th>\n",
" <td>car</td>\n",
" <td>2997</td>\n",
" <td>0.023274</td>\n",
" <td>[0.00344226087379, 0.00359079070889, 0.0028275...</td>\n",
" <td>45.386786</td>\n",
" </tr>\n",
" <tr>\n",
" <th>272</th>\n",
" <td>ya</td>\n",
" <td>5592</td>\n",
" <td>0.026968</td>\n",
" <td>[0.00123320930269, 0.00195264772387, 0.0016200...</td>\n",
" <td>44.320064</td>\n",
" </tr>\n",
" <tr>\n",
" <th>414</th>\n",
" <td>business</td>\n",
" <td>3556</td>\n",
" <td>0.021823</td>\n",
" <td>[0.00274174598968, 0.00202854888223, 0.0025652...</td>\n",
" <td>44.105368</td>\n",
" </tr>\n",
" <tr>\n",
" <th>360</th>\n",
" <td>post</td>\n",
" <td>4157</td>\n",
" <td>0.023613</td>\n",
" <td>[0.00363341161293, 0.00237725307378, 0.0050365...</td>\n",
" <td>41.064434</td>\n",
" </tr>\n",
" <tr>\n",
" <th>386</th>\n",
" <td>w/</td>\n",
" <td>3799</td>\n",
" <td>0.023760</td>\n",
" <td>[0.00305038649583, 0.00318733115633, 0.0032333...</td>\n",
" <td>40.946026</td>\n",
" </tr>\n",
" <tr>\n",
" <th>398</th>\n",
" <td>learn</td>\n",
" <td>3692</td>\n",
" <td>0.008013</td>\n",
" <td>[0.000890397809368, 0.00121039501476, 0.001066...</td>\n",
" <td>40.211879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>378</th>\n",
" <td>change</td>\n",
" <td>3877</td>\n",
" <td>0.020636</td>\n",
" <td>[0.00226427046406, 0.00314452526289, 0.0032838...</td>\n",
" <td>39.398374</td>\n",
" </tr>\n",
" <tr>\n",
" <th>365</th>\n",
" <td>hear</td>\n",
" <td>4044</td>\n",
" <td>0.013148</td>\n",
" <td>[0.00190486806428, 0.00139493049882, 0.0016828...</td>\n",
" <td>39.162212</td>\n",
" </tr>\n",
" <tr>\n",
" <th>452</th>\n",
" <td>needs</td>\n",
" <td>3263</td>\n",
" <td>0.009131</td>\n",
" <td>[0.00155186355075, 0.00118229797052, 0.0013700...</td>\n",
" <td>37.880544</td>\n",
" </tr>\n",
" <tr>\n",
" <th>472</th>\n",
" <td>meet</td>\n",
" <td>3086</td>\n",
" <td>0.012479</td>\n",
" <td>[0.00174712715112, 0.00147014494077, 0.0023726...</td>\n",
" <td>37.784471</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>amp</td>\n",
" <td>36555</td>\n",
" <td>0.003584</td>\n",
" <td>[0.000445869628667, 0.000526211508072, 0.00043...</td>\n",
" <td>37.729830</td>\n",
" </tr>\n",
" <tr>\n",
" <th>456</th>\n",
" <td>social</td>\n",
" <td>3218</td>\n",
" <td>0.011935</td>\n",
" <td>[0.00136758445685, 0.00142817423645, 0.0013712...</td>\n",
" <td>37.646990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>298</th>\n",
" <td>e</td>\n",
" <td>5014</td>\n",
" <td>0.002131</td>\n",
" <td>[0.00071127838677, 0.00111997462122, 0.0011171...</td>\n",
" <td>2.765757</td>\n",
" </tr>\n",
" <tr>\n",
" <th>249</th>\n",
" <td>hey</td>\n",
" <td>6192</td>\n",
" <td>0.000834</td>\n",
" <td>[0.000980674719306, 0.000616826034544, 0.00052...</td>\n",
" <td>2.509196</td>\n",
" </tr>\n",
" <tr>\n",
" <th>267</th>\n",
" <td>god</td>\n",
" <td>5689</td>\n",
" <td>0.002806</td>\n",
" <td>[0.00186721658526, 0.00251852692234, 0.0018583...</td>\n",
" <td>2.507279</td>\n",
" </tr>\n",
" <tr>\n",
" <th>439</th>\n",
" <td>n</td>\n",
" <td>3325</td>\n",
" <td>0.006086</td>\n",
" <td>[0.00293995593618, 0.00318388799321, 0.0028064...</td>\n",
" <td>2.487983</td>\n",
" </tr>\n",
" <tr>\n",
" <th>278</th>\n",
" <td>ass</td>\n",
" <td>5473</td>\n",
" <td>0.003146</td>\n",
" <td>[0.00267454142488, 0.00239882163089, 0.0023214...</td>\n",
" <td>2.473642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>202</th>\n",
" <td>fuck</td>\n",
" <td>7750</td>\n",
" <td>0.002836</td>\n",
" <td>[0.00223126022251, 0.00166447906941, 0.0014980...</td>\n",
" <td>2.383827</td>\n",
" </tr>\n",
" <tr>\n",
" <th>246</th>\n",
" <td>el</td>\n",
" <td>6329</td>\n",
" <td>0.000814</td>\n",
" <td>[0.000446732468665, 0.000356268750521, 0.00045...</td>\n",
" <td>2.349352</td>\n",
" </tr>\n",
" <tr>\n",
" <th>397</th>\n",
" <td>remember</td>\n",
" <td>3706</td>\n",
" <td>0.001821</td>\n",
" <td>[0.00118286005505, 0.00136679458645, 0.0013184...</td>\n",
" <td>2.300288</td>\n",
" </tr>\n",
" <tr>\n",
" <th>315</th>\n",
" <td>white</td>\n",
" <td>4823</td>\n",
" <td>0.001986</td>\n",
" <td>[0.00115464921491, 0.00125369026226, 0.0014720...</td>\n",
" <td>2.202182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>no</td>\n",
" <td>31873</td>\n",
" <td>0.002684</td>\n",
" <td>[0.00173625799333, 0.0024176797299, 0.00162166...</td>\n",
" <td>2.146976</td>\n",
" </tr>\n",
" <tr>\n",
" <th>232</th>\n",
" <td>im</td>\n",
" <td>6779</td>\n",
" <td>0.001357</td>\n",
" <td>[0.000913378285724, 0.00110274906009, 0.001028...</td>\n",
" <td>2.058491</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>my</td>\n",
" <td>115024</td>\n",
" <td>0.002057</td>\n",
" <td>[0.00158776491079, 0.00184817151368, 0.0015637...</td>\n",
" <td>1.700647</td>\n",
" </tr>\n",
" <tr>\n",
" <th>204</th>\n",
" <td>gon</td>\n",
" <td>7641</td>\n",
" <td>0.000688</td>\n",
" <td>[0.001087271043, 0.000919055750999, 0.00079605...</td>\n",
" <td>1.587030</td>\n",
" </tr>\n",
" <tr>\n",
" <th>346</th>\n",
" <td>damn</td>\n",
" <td>4347</td>\n",
" <td>0.002673</td>\n",
" <td>[0.00205400489143, 0.00160303178002, 0.0029709...</td>\n",
" <td>1.479652</td>\n",
" </tr>\n",
" <tr>\n",
" <th>438</th>\n",
" <td>president</td>\n",
" <td>3345</td>\n",
" <td>0.002480</td>\n",
" <td>[0.00185247990463, 0.00205992028678, 0.0022074...</td>\n",
" <td>1.439927</td>\n",
" </tr>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>u</td>\n",
" <td>14663</td>\n",
" <td>0.003613</td>\n",
" <td>[0.00424093655551, 0.0028234087273, 0.00278868...</td>\n",
" <td>1.337101</td>\n",
" </tr>\n",
" <tr>\n",
" <th>475</th>\n",
" <td>dude</td>\n",
" <td>3055</td>\n",
" <td>0.002950</td>\n",
" <td>[0.00268938238552, 0.00325738481028, 0.0020650...</td>\n",
" <td>1.297590</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400</th>\n",
" <td>ur</td>\n",
" <td>3688</td>\n",
" <td>0.002625</td>\n",
" <td>[0.00290464740444, 0.00239695635683, 0.0023634...</td>\n",
" <td>1.265491</td>\n",
" </tr>\n",
" <tr>\n",
" <th>341</th>\n",
" <td>playlist</td>\n",
" <td>4440</td>\n",
" <td>0.002608</td>\n",
" <td>[0.0021429648858, 0.00150084378112, 0.00210966...</td>\n",
" <td>1.211341</td>\n",
" </tr>\n",
" <tr>\n",
" <th>364</th>\n",
" <td>movie</td>\n",
" <td>4064</td>\n",
" <td>0.003761</td>\n",
" <td>[0.00306334928857, 0.00353916956988, 0.0029908...</td>\n",
" <td>1.133617</td>\n",
" </tr>\n",
" <tr>\n",
" <th>391</th>\n",
" <td>bitch</td>\n",
" <td>3774</td>\n",
" <td>0.002331</td>\n",
" <td>[0.00171471173769, 0.00196388744309, 0.0021903...</td>\n",
" <td>1.078767</td>\n",
" </tr>\n",
" <tr>\n",
" <th>161</th>\n",
" <td>shit</td>\n",
" <td>10246</td>\n",
" <td>0.002562</td>\n",
" <td>[0.00176936684669, 0.00235171025843, 0.0022017...</td>\n",
" <td>1.027394</td>\n",
" </tr>\n",
" <tr>\n",
" <th>442</th>\n",
" <td>photo</td>\n",
" <td>3312</td>\n",
" <td>0.002929</td>\n",
" <td>[0.00364762890397, 0.00197530806939, 0.0033352...</td>\n",
" <td>1.015559</td>\n",
" </tr>\n",
" <tr>\n",
" <th>238</th>\n",
" <td>girl</td>\n",
" <td>6536</td>\n",
" <td>0.003030</td>\n",
" <td>[0.00226944374873, 0.00245019617065, 0.0025590...</td>\n",
" <td>0.502013</td>\n",
" </tr>\n",
" <tr>\n",
" <th>376</th>\n",
" <td>mom</td>\n",
" <td>3919</td>\n",
" <td>0.002492</td>\n",
" <td>[0.00221931648228, 0.00251558123951, 0.0021144...</td>\n",
" <td>0.459322</td>\n",
" </tr>\n",
" <tr>\n",
" <th>498</th>\n",
" <td>y'all</td>\n",
" <td>2893</td>\n",
" <td>0.002356</td>\n",
" <td>[0.00186848643965, 0.0022975931405, 0.00218380...</td>\n",
" <td>0.351725</td>\n",
" </tr>\n",
" <tr>\n",
" <th>405</th>\n",
" <td>es</td>\n",
" <td>3634</td>\n",
" <td>0.000490</td>\n",
" <td>[0.000463265577439, 0.000533759973911, 0.00042...</td>\n",
" <td>0.349650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>340</th>\n",
" <td>lmao</td>\n",
" <td>4482</td>\n",
" <td>0.001795</td>\n",
" <td>[0.00170374075911, 0.00134416100933, 0.0017613...</td>\n",
" <td>0.192227</td>\n",
" </tr>\n",
" <tr>\n",
" <th>343</th>\n",
" <td>se</td>\n",
" <td>4401</td>\n",
" <td>0.000729</td>\n",
" <td>[0.000995545457114, 0.00057913246497, 0.001377...</td>\n",
" <td>0.175775</td>\n",
" </tr>\n",
" <tr>\n",
" <th>440</th>\n",
" <td>con</td>\n",
" <td>3322</td>\n",
" <td>0.000595</td>\n",
" <td>[0.00112114873198, 0.000513214087504, 0.000618...</td>\n",
" <td>0.067421</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>500 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" token count d_exp \\\n",
"203 los 7746 0.112331 \n",
"437 ai 3352 0.058637 \n",
"348 — 4329 0.070315 \n",
"284 talk 5340 0.019648 \n",
"97 la 17385 0.032928 \n",
"323 california 4747 0.027580 \n",
"229 its 6940 0.023652 \n",
"289 -- 5152 0.040916 \n",
"328 angeles 4662 0.021476 \n",
"432 proud 3388 0.013832 \n",
"120 $ 14568 0.012642 \n",
"466 future 3147 0.026197 \n",
"305 ve 4936 0.024768 \n",
"86 via 20589 0.013783 \n",
"302 excited 4982 0.009982 \n",
"68 more 27929 0.016556 \n",
"382 + 3869 0.023103 \n",
"368 care 3978 0.024201 \n",
"479 car 2997 0.023274 \n",
"272 ya 5592 0.026968 \n",
"414 business 3556 0.021823 \n",
"360 post 4157 0.023613 \n",
"386 w/ 3799 0.023760 \n",
"398 learn 3692 0.008013 \n",
"378 change 3877 0.020636 \n",
"365 hear 4044 0.013148 \n",
"452 needs 3263 0.009131 \n",
"472 meet 3086 0.012479 \n",
"51 amp 36555 0.003584 \n",
"456 social 3218 0.011935 \n",
".. ... ... ... \n",
"298 e 5014 0.002131 \n",
"249 hey 6192 0.000834 \n",
"267 god 5689 0.002806 \n",
"439 n 3325 0.006086 \n",
"278 ass 5473 0.003146 \n",
"202 fuck 7750 0.002836 \n",
"246 el 6329 0.000814 \n",
"397 remember 3706 0.001821 \n",
"315 white 4823 0.001986 \n",
"62 no 31873 0.002684 \n",
"232 im 6779 0.001357 \n",
"15 my 115024 0.002057 \n",
"204 gon 7641 0.000688 \n",
"346 damn 4347 0.002673 \n",
"438 president 3345 0.002480 \n",
"118 u 14663 0.003613 \n",
"475 dude 3055 0.002950 \n",
"400 ur 3688 0.002625 \n",
"341 playlist 4440 0.002608 \n",
"364 movie 4064 0.003761 \n",
"391 bitch 3774 0.002331 \n",
"161 shit 10246 0.002562 \n",
"442 photo 3312 0.002929 \n",
"238 girl 6536 0.003030 \n",
"376 mom 3919 0.002492 \n",
"498 y'all 2893 0.002356 \n",
"405 es 3634 0.000490 \n",
"340 lmao 4482 0.001795 \n",
"343 se 4401 0.000729 \n",
"440 con 3322 0.000595 \n",
"\n",
" ds z \n",
"203 [0.00135790740553, 0.000963894324147, 0.001269... 232.762017 \n",
"437 [0.00188516379617, 0.00139467376053, 0.0015749... 161.517873 \n",
"348 [0.0051891477794, 0.00339512870429, 0.00291878... 91.550501 \n",
"284 [0.0011781055589, 0.00149699740841, 0.00138911... 90.328862 \n",
"97 [0.000923795118494, 0.000985264309624, 0.00162... 85.569749 \n",
"323 [0.00178457186927, 0.0017313196533, 0.00180051... 81.001580 \n",
"229 [0.00159505279716, 0.00141928954964, 0.0013946... 77.599160 \n",
"289 [0.00343410841811, 0.00226424170883, 0.0017492... 75.712834 \n",
"328 [0.00119217160129, 0.00188189526946, 0.0011119... 66.917553 \n",
"432 [0.00122885878954, 0.00113846442952, 0.0015184... 62.332236 \n",
"120 [0.000890424457262, 0.000949469690017, 0.00106... 60.034818 \n",
"466 [0.00282667512576, 0.00348593916129, 0.0032119... 53.824493 \n",
"305 [0.000858097256798, 0.00147942598951, 0.001316... 51.242633 \n",
"86 [0.00112213406955, 0.00127279928217, 0.0012672... 48.546740 \n",
"302 [0.000896301546452, 0.000960477868357, 0.00116... 47.222492 \n",
"68 [0.00221123896012, 0.00173908846677, 0.0017509... 46.883646 \n",
"382 [0.00391646925771, 0.00408116596851, 0.0032337... 46.831773 \n",
"368 [0.00230850653227, 0.00216674519179, 0.0018881... 46.454507 \n",
"479 [0.00344226087379, 0.00359079070889, 0.0028275... 45.386786 \n",
"272 [0.00123320930269, 0.00195264772387, 0.0016200... 44.320064 \n",
"414 [0.00274174598968, 0.00202854888223, 0.0025652... 44.105368 \n",
"360 [0.00363341161293, 0.00237725307378, 0.0050365... 41.064434 \n",
"386 [0.00305038649583, 0.00318733115633, 0.0032333... 40.946026 \n",
"398 [0.000890397809368, 0.00121039501476, 0.001066... 40.211879 \n",
"378 [0.00226427046406, 0.00314452526289, 0.0032838... 39.398374 \n",
"365 [0.00190486806428, 0.00139493049882, 0.0016828... 39.162212 \n",
"452 [0.00155186355075, 0.00118229797052, 0.0013700... 37.880544 \n",
"472 [0.00174712715112, 0.00147014494077, 0.0023726... 37.784471 \n",
"51 [0.000445869628667, 0.000526211508072, 0.00043... 37.729830 \n",
"456 [0.00136758445685, 0.00142817423645, 0.0013712... 37.646990 \n",
".. ... ... \n",
"298 [0.00071127838677, 0.00111997462122, 0.0011171... 2.765757 \n",
"249 [0.000980674719306, 0.000616826034544, 0.00052... 2.509196 \n",
"267 [0.00186721658526, 0.00251852692234, 0.0018583... 2.507279 \n",
"439 [0.00293995593618, 0.00318388799321, 0.0028064... 2.487983 \n",
"278 [0.00267454142488, 0.00239882163089, 0.0023214... 2.473642 \n",
"202 [0.00223126022251, 0.00166447906941, 0.0014980... 2.383827 \n",
"246 [0.000446732468665, 0.000356268750521, 0.00045... 2.349352 \n",
"397 [0.00118286005505, 0.00136679458645, 0.0013184... 2.300288 \n",
"315 [0.00115464921491, 0.00125369026226, 0.0014720... 2.202182 \n",
"62 [0.00173625799333, 0.0024176797299, 0.00162166... 2.146976 \n",
"232 [0.000913378285724, 0.00110274906009, 0.001028... 2.058491 \n",
"15 [0.00158776491079, 0.00184817151368, 0.0015637... 1.700647 \n",
"204 [0.001087271043, 0.000919055750999, 0.00079605... 1.587030 \n",
"346 [0.00205400489143, 0.00160303178002, 0.0029709... 1.479652 \n",
"438 [0.00185247990463, 0.00205992028678, 0.0022074... 1.439927 \n",
"118 [0.00424093655551, 0.0028234087273, 0.00278868... 1.337101 \n",
"475 [0.00268938238552, 0.00325738481028, 0.0020650... 1.297590 \n",
"400 [0.00290464740444, 0.00239695635683, 0.0023634... 1.265491 \n",
"341 [0.0021429648858, 0.00150084378112, 0.00210966... 1.211341 \n",
"364 [0.00306334928857, 0.00353916956988, 0.0029908... 1.133617 \n",
"391 [0.00171471173769, 0.00196388744309, 0.0021903... 1.078767 \n",
"161 [0.00176936684669, 0.00235171025843, 0.0022017... 1.027394 \n",
"442 [0.00364762890397, 0.00197530806939, 0.0033352... 1.015559 \n",
"238 [0.00226944374873, 0.00245019617065, 0.0025590... 0.502013 \n",
"376 [0.00221931648228, 0.00251558123951, 0.0021144... 0.459322 \n",
"498 [0.00186848643965, 0.0022975931405, 0.00218380... 0.351725 \n",
"405 [0.000463265577439, 0.000533759973911, 0.00042... 0.349650 \n",
"340 [0.00170374075911, 0.00134416100933, 0.0017613... 0.192227 \n",
"343 [0.000995545457114, 0.00057913246497, 0.001377... 0.175775 \n",
"440 [0.00112114873198, 0.000513214087504, 0.000618... 0.067421 \n",
"\n",
"[500 rows x 5 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values('z', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment