Skip to content

Instantly share code, notes, and snippets.

@nanaBaah
Created January 27, 2019 11:50
Show Gist options
  • Save nanaBaah/06803b443a2956c84ca03d1c09a9467b to your computer and use it in GitHub Desktop.
Save nanaBaah/06803b443a2956c84ca03d1c09a9467b to your computer and use it in GitHub Desktop.
Deception Detection
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Project: Natural Language Processing and the Web\n",
"\n",
"## **Open-Domain Deception Detection**\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>.container { width:98% !important; }</style>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import scattertext as st\n",
"import spacy\n",
"\n",
"# Use the public IPython.display API; importing display/HTML from IPython.core.display is deprecated.\n",
"from IPython.display import HTML, IFrame, display\n",
"\n",
"# Widen the notebook container so the embedded Scattertext iframes fit on screen.\n",
"display(HTML(\"<style>.container { width:98% !important; }</style>\"))\n",
"\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def _version_tuple(version):\n",
"    \"\"\"Return the leading numeric components of a dotted version string as a tuple of ints.\"\"\"\n",
"    parts = []\n",
"    for piece in version.split('.'):\n",
"        if not piece.isdigit():\n",
"            break  # stop at the first non-numeric component (e.g. 'rc1', 'dev0')\n",
"        parts.append(int(piece))\n",
"    return tuple(parts)\n",
"\n",
"# Compare numerically: as plain strings '0.0.2.9' >= '0.0.2.25' is True and '0.0.10' >= '0.0.2.25' is False.\n",
"assert _version_tuple(st.__version__) >= (0, 0, 2, 25), 'scattertext >= 0.0.2.25 is required'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### **Truth and Lies Data set**\n",
"\n",
"#### Dataset consists of labelled text of truth and lies.\n",
"\n",
"#### Dataset is from https://web.eecs.umich.edu/~mihalcea/downloads.html#OpenDeception\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Negative 1811\n",
"Positive 1773\n",
"Name: label, dtype: int64\n"
]
}
],
"source": [
"label_names = {'lie': 'Negative', 'truth': 'Positive'}\n",
"\n",
"# Read the tab-separated training file and rename the raw labels to the category names used below.\n",
"rdf = pd.read_csv('train.txt', sep='\\t')\n",
"rdf['label'] = rdf['label'].apply(lambda raw_label: label_names[raw_label])\n",
"print(rdf['label'].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Build the Scattertext corpus from the labelled frame, with st.whitespace_nlp_with_sentences as the nlp callable.\n",
"corpus_factory = st.CorpusFromPandas(\n",
"    rdf,\n",
"    category_col='label',\n",
"    text_col='text',\n",
"    nlp=st.whitespace_nlp_with_sentences,\n",
")\n",
"corpus = corpus_factory.build()\n",
"\n",
"# Save the term-frequency table to disk, then derive the unigram-only corpus.\n",
"term_freqs = corpus.get_term_freq_df()\n",
"term_freqs.to_csv('term_freqs.csv')\n",
"unigram_corpus = corpus.get_unigram_corpus()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### **Visualization of the corpus using Scattertext**\n",
"\n",
"#### The x-axis shows the rank of each word or bigram among the positive (truth) texts, and the y-axis shows its rank among the negative (lie) texts.\n",
"\n",
"#### Ranks are determined using the \"dense\" ranking, meaning the most frequent terms, regardless of ties, are given rank 1, the next most frequent terms, regardless of ties, are given rank 2, and so on.\n",
"\n",
"#### Scattertext selectively labels points in such a way as to prevent labels from overlapping other elements of the graph. \n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/nana/anaconda3/envs/nlp/lib/python3.6/site-packages/scattertext/frequencyreaders/DefaultBackgroundFrequencies.py:30: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\\t'.\n",
" names=['word', 'background'])\n"
]
},
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"1300\"\n",
" height=\"700\"\n",
" src=\"lies_truth_detection.html\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7f06bdd29da0>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rank-difference explorer with dense-percentile scaling of the term ranks.\n",
"html = st.produce_scattertext_explorer(\n",
"    corpus,\n",
"    category='Positive',\n",
"    not_categories=['Negative'],\n",
"    sort_by_dist=False,\n",
"    term_scorer=st.RankDifference(),\n",
"    transform=st.Scalers.percentile_dense\n",
")\n",
"file_name = 'lies_truth_detection.html'\n",
"# Write through a context manager so the file is flushed and closed before the iframe loads it.\n",
"with open(file_name, 'wb') as html_file:\n",
"    html_file.write(html.encode('utf-8'))\n",
"IFrame(src=file_name, width=1300, height=700)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/nana/anaconda3/envs/nlp/lib/python3.6/site-packages/scattertext/frequencyreaders/DefaultBackgroundFrequencies.py:30: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\\t'.\n",
" names=['word', 'background'])\n"
]
},
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"1300\"\n",
" height=\"700\"\n",
" src=\"lies_truth_analyzing.html\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7f06bda68c50>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rank-difference explorer using Scattertext's default coordinate transform.\n",
"html = st.produce_scattertext_explorer(\n",
"    corpus,\n",
"    category='Positive',\n",
"    not_categories=['Negative'],\n",
"    sort_by_dist=False,\n",
"    term_scorer=st.RankDifference(),\n",
")\n",
"file_name = 'lies_truth_analyzing.html'\n",
"# Write through a context manager so the file is flushed and closed before the iframe loads it.\n",
"with open(file_name, 'wb') as html_file:\n",
"    html_file.write(html.encode('utf-8'))\n",
"IFrame(src=file_name, width=1300, height=700)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### **Scaled F-Score**"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from scipy.stats import hmean\n",
"\n",
"# Unigram counts per class. Take an explicit copy after filtering so the derived columns below\n",
"# are added to an independent frame instead of a slice (avoids SettingWithCopy problems).\n",
"term_freq_df = corpus.get_unigram_corpus().get_term_freq_df()[['Positive freq', 'Negative freq']]\n",
"term_freq_df = term_freq_df[term_freq_df.sum(axis=1) > 0].copy()\n",
"\n",
"# Precision: share of a term's total count that comes from the Positive column.\n",
"term_freq_df['pos_precision'] = (term_freq_df['Positive freq'] * 1. /\n",
"                                 (term_freq_df['Positive freq'] + term_freq_df['Negative freq']))\n",
"\n",
"# Frequency: the term's Positive count as a share of the Positive column total.\n",
"term_freq_df['pos_freq_pct'] = (term_freq_df['Positive freq'] * 1.\n",
"                                / term_freq_df['Positive freq'].sum())\n",
"\n",
"def _safe_hmean(row):\n",
"    \"\"\"Harmonic mean of precision and frequency for one term; 0 unless both are positive.\"\"\"\n",
"    if row['pos_precision'] > 0 and row['pos_freq_pct'] > 0:\n",
"        return hmean([row['pos_precision'], row['pos_freq_pct']])\n",
"    return 0\n",
"\n",
"term_freq_df['pos_hmean'] = term_freq_df.apply(_safe_hmean, axis=1)\n",
"# term_freq_df.sort_values(by='pos_hmean', ascending=False).iloc[:10]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 5282.000000\n",
"mean 0.000189\n",
"std 0.001359\n",
"min 0.000000\n",
"25% 0.000000\n",
"50% 0.000069\n",
"75% 0.000138\n",
"max 0.050526\n",
"Name: pos_freq_pct, dtype: float64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics of the per-term Positive frequency share.\n",
"term_freq_df['pos_freq_pct'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 5282.000000\n",
"mean 0.489611\n",
"std 0.421531\n",
"min 0.000000\n",
"25% 0.000000\n",
"50% 0.500000\n",
"75% 1.000000\n",
"max 1.000000\n",
"Name: pos_precision, dtype: float64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics of the per-term Positive precision.\n",
"term_freq_df['pos_precision'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"1300\"\n",
" height=\"700\"\n",
" src=\"not_normed_freq_prec.html\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7f06b7a8d0b8>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Plot raw frequency (x) against raw precision (y), coloured by their harmonic mean.\n",
"freq = term_freq_df.pos_freq_pct.values\n",
"prec = term_freq_df.pos_precision.values\n",
"html = st.produce_scattertext_explorer(\n",
"    corpus.remove_terms(set(corpus.get_terms()) - set(term_freq_df.index)),\n",
"    category='Positive',\n",
"    not_category_name='Negative',\n",
"    not_categories=['Negative'],\n",
"\n",
"    x_label='Portion of words used in positive reviews',\n",
"    original_x=freq,\n",
"    # Min-max scale to [0, 1]: divide by the range (max - min), not by the maximum alone.\n",
"    x_coords=(freq - freq.min()) / (freq.max() - freq.min()),\n",
"    x_axis_values=[int(freq.min() * 1000) / 1000.,\n",
"                   int(freq.max() * 1000) / 1000.],\n",
"\n",
"    y_label='Portion of documents containing word that are positive',\n",
"    original_y=prec,\n",
"    y_coords=(prec - prec.min()) / (prec.max() - prec.min()),\n",
"    # The middle tick is the midpoint of the plotted range, matching the min-max scaling above.\n",
"    y_axis_values=[int(prec.min() * 1000) / 1000.,\n",
"                   int(((prec.min() + prec.max()) / 2.) * 1000) / 1000.,\n",
"                   int(prec.max() * 1000) / 1000.],\n",
"    scores=term_freq_df.pos_hmean.values,\n",
"\n",
"    sort_by_dist=False,\n",
"    show_characteristic=False\n",
")\n",
"file_name = 'not_normed_freq_prec.html'\n",
"# Write through a context manager so the file is flushed and closed before the iframe loads it.\n",
"with open(file_name, 'wb') as html_file:\n",
"    html_file.write(html.encode('utf-8'))\n",
"IFrame(src=file_name, width=1300, height=700)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1080x720 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"import seaborn as sns\n",
"from scipy.stats import norm\n",
"\n",
"fig, ax = plt.subplots(figsize=(15,10))\n",
"# Keep only strictly positive frequency shares so the logarithm is defined.\n",
"freqs = term_freq_df.pos_freq_pct[term_freq_df.pos_freq_pct > 0]\n",
"log_freqs = np.log(freqs)\n",
"\n",
"# Rug marks for the first 1000 log frequencies (sns.distplot is deprecated; rugplot draws the rug directly).\n",
"sns.rugplot(log_freqs[:1000], color='k', ax=ax)\n",
"\n",
"x = np.linspace(log_freqs.min(), log_freqs.max(), 100)\n",
"# Normal distribution parameterised by the mean and standard deviation of the log frequencies.\n",
"frozen_norm = norm(log_freqs.mean(), log_freqs.std())\n",
"y = frozen_norm.pdf(x)\n",
"ax.plot(x, y, color='k')\n",
"\n",
"term = 'game'\n",
"word_freq = log_freqs.loc[term]\n",
"term_cdf = frozen_norm.cdf(word_freq)\n",
"ax.axvline(x=word_freq, color='red', label='Log frequency of \"'+term+'\"')\n",
"# Shade the area under the density to the left of the term, i.e. its CDF value.\n",
"below_term = x < word_freq\n",
"ax.fill_between(x[below_term],\n",
"                y[below_term], 0,\n",
"                facecolor='blue',\n",
"                alpha=0.5,\n",
"                # Backslash is escaped explicitly; '\\in' on its own is an invalid escape sequence.\n",
"                label=\"Log-normal CDF of %s: $%0.3f \\\\in [0,1]$\" % (term, term_cdf))\n",
"ax.set_xlabel('Log term frequency')\n",
"ax.set_ylabel('Cumulative term probability')\n",
"# Set the legend font size BEFORE creating the legend; rc changes do not affect an existing legend.\n",
"plt.rc('legend', fontsize=20)\n",
"ax.legend()\n",
"for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +\n",
"             ax.get_xticklabels() + ax.get_yticklabels()):\n",
"    item.set_fontsize(20)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from scipy.stats import norm\n",
"\n",
"def normcdf(x):\n",
"    \"\"\"Map each value of ``x`` to its CDF under a normal fitted to ``x`` itself.\n",
"\n",
"    The input is converted to a NumPy array first so the standard deviation is always the\n",
"    population one (ddof=0). A pandas Series would otherwise use ddof=1, giving slightly\n",
"    different scores for the same data depending on how the caller passed it.\n",
"\n",
"    Returns a NumPy array of the same length as ``x``.\n",
"    \"\"\"\n",
"    values = np.asarray(x, dtype=float)\n",
"    return norm.cdf(values, values.mean(), values.std())\n",
"\n",
"# Normal-CDF transform of the Positive precision and frequency columns.\n",
"precision_cdf = normcdf(term_freq_df['pos_precision'])\n",
"frequency_cdf = normcdf(term_freq_df['pos_freq_pct'].values)\n",
"term_freq_df['pos_precision_normcdf'] = precision_cdf\n",
"term_freq_df['pos_freq_pct_normcdf'] = frequency_cdf\n",
"\n",
"# Harmonic mean of the two transformed columns, computed per term.\n",
"term_freq_df['pos_scaled_f_score'] = hmean([\n",
"    term_freq_df['pos_precision_normcdf'],\n",
"    term_freq_df['pos_freq_pct_normcdf'],\n",
"])\n",
"\n",
"#term_freq_df.sort_values(by='pos_scaled_f_score', ascending=False).iloc[:10]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/nana/anaconda3/envs/nlp/lib/python3.6/site-packages/scattertext/frequencyreaders/DefaultBackgroundFrequencies.py:30: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\\t'.\n",
" names=['word', 'background'])\n"
]
},
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"1300\"\n",
" height=\"700\"\n",
" src=\"normed_freq_prec.html\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7f06b87e95f8>"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Plot norm-CDF frequency (x) against norm-CDF precision (y), coloured by the scaled F-score.\n",
"freq = term_freq_df.pos_freq_pct_normcdf.values\n",
"prec = term_freq_df.pos_precision_normcdf.values\n",
"html = st.produce_scattertext_explorer(\n",
"    corpus.remove_terms(set(corpus.get_terms()) - set(term_freq_df.index)),\n",
"    category='Positive',\n",
"    not_category_name='Negative',\n",
"    not_categories=['Negative'],\n",
"\n",
"    x_label='Portion of words used in positive reviews (norm-cdf)',\n",
"    original_x=freq,\n",
"    # Min-max scale to [0, 1]: divide by the range (max - min). Dividing by the maximum alone\n",
"    # leaves the largest point short of 1 whenever the minimum is above zero.\n",
"    x_coords=(freq - freq.min()) / (freq.max() - freq.min()),\n",
"    x_axis_values=[int(freq.min() * 1000) / 1000.,\n",
"                   int(freq.max() * 1000) / 1000.],\n",
"\n",
"    y_label='documents containing word that are positive (norm-cdf)',\n",
"    original_y=prec,\n",
"    y_coords=(prec - prec.min()) / (prec.max() - prec.min()),\n",
"    # The middle tick is the midpoint of the plotted range, matching the min-max scaling above.\n",
"    y_axis_values=[int(prec.min() * 1000) / 1000.,\n",
"                   int(((prec.min() + prec.max()) / 2.) * 1000) / 1000.,\n",
"                   int(prec.max() * 1000) / 1000.],\n",
"    scores=term_freq_df.pos_scaled_f_score.values,\n",
"\n",
"    sort_by_dist=False,\n",
"    show_characteristic=False\n",
")\n",
"file_name = 'normed_freq_prec.html'\n",
"# Write through a context manager so the file is flushed and closed before the iframe loads it.\n",
"with open(file_name, 'wb') as html_file:\n",
"    html_file.write(html.encode('utf-8'))\n",
"IFrame(src=file_name, width=1300, height=700)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Negative-class counterparts of the norm-CDF precision and frequency columns.\n",
"term_freq_df['neg_precision_normcdf'] = normcdf((term_freq_df['Negative freq'] * 1./\n",
"    (term_freq_df['Negative freq'] + term_freq_df['Positive freq'])))\n",
"\n",
"term_freq_df['neg_freq_pct_normcdf'] = normcdf((term_freq_df['Negative freq'] * 1.\n",
"    /term_freq_df['Negative freq'].sum()))\n",
"\n",
"term_freq_df['neg_scaled_f_score'] = hmean([term_freq_df['neg_precision_normcdf'], term_freq_df['neg_freq_pct_normcdf']])\n",
"\n",
"pos_score = term_freq_df['pos_scaled_f_score']\n",
"neg_score = term_freq_df['neg_scaled_f_score']\n",
"# Take the positive score where it wins and 1 - negative score where that wins. Ties get the\n",
"# neutral value 0.5 so they map to 0 after rescaling; initialising with 0 sent tied terms\n",
"# to -1, i.e. the most negative score.\n",
"combined = np.where(pos_score > neg_score, pos_score,\n",
"                    np.where(pos_score < neg_score, 1 - neg_score, 0.5))\n",
"# Rescale from [0, 1] to [-1, 1].\n",
"term_freq_df['scaled_f_score'] = 2 * (combined - 0.5)\n",
"#term_freq_df.sort_values(by='scaled_f_score', ascending=False).iloc[:10]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"1300\"\n",
" height=\"700\"\n",
" src=\"sfs_explain.html\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7f06b69df7f0>"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per term, keep the values of the class with the larger scaled F-score; the other class's values get a minus sign.\n",
"is_pos = term_freq_df['pos_scaled_f_score'] > term_freq_df['neg_scaled_f_score']\n",
"is_neg = ~is_pos\n",
"freq = term_freq_df['pos_freq_pct_normcdf'] * is_pos - term_freq_df['neg_freq_pct_normcdf'] * is_neg\n",
"prec = term_freq_df['pos_precision_normcdf'] * is_pos - term_freq_df['neg_precision_normcdf'] * is_neg\n",
"def scale(ar):\n",
"    \"\"\"Min-max rescale ``ar`` so its smallest value maps to 0 and its largest to 1.\"\"\"\n",
"    lowest, highest = ar.min(), ar.max()\n",
"    return (ar - lowest) / (highest - lowest)\n",
"def close_gap(ar):\n",
"    \"\"\"Return a copy of ``ar`` with its positive and negative values shifted to meet at zero.\n",
"\n",
"    Positive entries are reduced by the smallest positive value and negative entries by the\n",
"    largest negative value. The shift is applied to a copy: the explorer call passes the same\n",
"    object as ``original_x``/``original_y``, and shifting in place overwrote those values.\n",
"    \"\"\"\n",
"    shifted = ar.copy()\n",
"    positive = shifted > 0\n",
"    negative = shifted < 0\n",
"    # Guard each side so an input with no positive (or no negative) entries is left as is.\n",
"    if positive.any():\n",
"        shifted[positive] -= shifted[positive].min()\n",
"    if negative.any():\n",
"        shifted[negative] -= shifted[negative].max()\n",
"    return shifted\n",
"\n",
"# Plot signed frequency (x) against signed precision (y), coloured by the combined scaled F-score.\n",
"html = st.produce_scattertext_explorer(\n",
"    corpus.remove_terms(set(corpus.get_terms()) - set(term_freq_df.index)),\n",
"    category='Positive',\n",
"    not_category_name='Negative',\n",
"    not_categories=['Negative'],\n",
"\n",
"    x_label='Frequency',\n",
"    original_x=freq,\n",
"    x_coords=scale(close_gap(freq)),\n",
"    x_axis_labels=['Frequent in Neg',\n",
"                   'Not Frequent',\n",
"                   'Frequent in Pos'],\n",
"\n",
"    y_label='Precision',\n",
"    original_y=prec,\n",
"    y_coords=scale(close_gap(prec)),\n",
"    y_axis_labels=['Neg Precise',\n",
"                   'Imprecise',\n",
"                   'Pos Precise'],\n",
"\n",
"    # Map the [-1, 1] score back to [0, 1] for colouring.\n",
"    scores=(term_freq_df.scaled_f_score.values + 1)/2,\n",
"    sort_by_dist=False,\n",
"    show_characteristic=False\n",
")\n",
"file_name = 'sfs_explain.html'\n",
"# Write through a context manager so the file is flushed and closed before the iframe loads it.\n",
"with open(file_name, 'wb') as html_file:\n",
"    html_file.write(html.encode('utf-8'))\n",
"IFrame(src=file_name, width=1300, height=700)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"1300\"\n",
" height=\"700\"\n",
" src=\"freq_sfs.html\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7f06b69d3978>"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Frequency explorer scored with Scattertext's built-in scaled F-score preset.\n",
"html = st.produce_frequency_explorer(\n",
"    corpus.remove_terms(set(corpus.get_terms()) - set(term_freq_df.index)),\n",
"    category='Positive',\n",
"    not_category_name='Negative',\n",
"    not_categories=['Negative'],\n",
"    term_scorer=st.ScaledFScorePresets(beta=1, one_to_neg_one=True),\n",
"    grey_threshold=0\n",
")\n",
"file_name = 'freq_sfs.html'\n",
"# Write through a context manager so the file is flushed and closed before the iframe loads it.\n",
"with open(file_name, 'wb') as html_file:\n",
"    html_file.write(html.encode('utf-8'))\n",
"IFrame(src=file_name, width=1300, height=700)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment