Skip to content

Instantly share code, notes, and snippets.

@7bitlyrus
Created November 29, 2021 19:29
Show Gist options
  • Save 7bitlyrus/9d427d87bed0f0889d8187085465e6d5 to your computer and use it in GitHub Desktop.
Save 7bitlyrus/9d427d87bed0f0889d8187085465e6d5 to your computer and use it in GitHub Desktop.
alt_detection.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "alt_detection.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyPw1OEeDWIU1bSvb3vfyboO",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/7bitlyrus/9d427d87bed0f0889d8187085465e6d5/alt_detection.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7w_AYCz8Qsvf"
},
"source": [
"Code for this workbook adapted from [jabraunlin/reddit-user-id](https://github.com/jabraunlin/reddit-user-id/blob/master/delta_model.py).\n",
"\n",
"Make sure to add your `*.json` files to the runtime, formatted as: `[{'author': '...', 'content': '...'}, ...]`"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qVBOo_bDQDSy",
"outputId": "647b778e-7950-44a5-8b67-f995dcfdba54"
},
"source": [
"# %pip installs into the environment of the running kernel\n",
"%pip install pyspark\n",
"# -O makes a re-run overwrite skip_grams.csv instead of saving numbered copies (skip_grams.csv.1, ...)\n",
"!wget -O skip_grams.csv https://raw.githubusercontent.com/jabraunlin/reddit-user-id/master/skip_grams.csv"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: pyspark in /usr/local/lib/python3.7/dist-packages (3.2.0)\n",
"Requirement already satisfied: py4j==0.10.9.2 in /usr/local/lib/python3.7/dist-packages (from pyspark) (0.10.9.2)\n",
"--2021-11-29 18:47:16-- https://raw.githubusercontent.com/jabraunlin/reddit-user-id/master/skip_grams.csv\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1526 (1.5K) [text/plain]\n",
"Saving to: ‘skip_grams.csv.3’\n",
"\n",
"skip_grams.csv.3 100%[===================>] 1.49K --.-KB/s in 0s \n",
"\n",
"2021-11-29 18:47:16 (36.0 MB/s) - ‘skip_grams.csv.3’ saved [1526/1526]\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "dSlvJyRTQZym"
},
"source": [
"import pyspark as ps\n",
"from pyspark.sql import functions as F\n",
"from pyspark.sql.functions import udf, col\n",
"from pyspark.sql.types import StringType, FloatType, IntegerType, ArrayType\n",
"from pyspark.ml import Pipeline\n",
"from pyspark.ml.classification import LogisticRegression\n",
"from pyspark.ml.feature import CountVectorizer, Tokenizer, HashingTF, StandardScaler, Normalizer\n",
"from pyspark.ml.feature import StopWordsRemover\n",
"import re\n",
"import matplotlib.pyplot as plt\n",
"import nltk\n",
"import numpy as np\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import TweetTokenizer\n",
"import pandas as pd\n",
"from nltk.util import skipgrams\n",
"from itertools import chain\n",
"from scipy.cluster import hierarchy\n",
"import csv\n",
"from scipy.stats import norm"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JakHWiBmSknj",
"outputId": "0a5d4c18-b4bf-4b57-90ea-495fa9d01c3a"
},
"source": [
"# Fetch the NLTK resources used below: the English stop-word list and the POS-tagger model\n",
"nltk.download('stopwords')\n",
"nltk.download('averaged_perceptron_tagger')"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] /root/nltk_data...\n",
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
"[nltk_data] date!\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "j3IrK90ORFT1"
},
"source": [
"# Start (or reuse) a Spark session with master 'local[4]' and application name 'project1'\n",
"spark = (\n",
"    ps.sql.SparkSession.builder\n",
"    .master(\"local[4]\")\n",
"    .appName(\"project1\")\n",
"    .getOrCreate()\n",
")\n",
"\n",
"# Keep a handle on the session's underlying SparkContext\n",
"sc = spark.sparkContext"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "KdhwD8aZUHHq"
},
"source": [
"# Read sharded json files: every path matching *.json is loaded into one Spark DataFrame.\n",
"# The cells below rely on the 'author' and 'content' columns of this frame.\n",
"df = spark.read.json(\"*.json\")"
],
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "lZnqHryoRLiS"
},
"source": [
"# Keep only prolific authors: more than 400 comments each, so that a 50/50 split\n",
"# leaves roughly 200 comments in each half\n",
"new_df = df.groupby('author').agg(F.count('content'))\n",
"is_prolific = new_df['count(content)'] > 400\n",
"authors = new_df.where(is_prolific).select('author')"
],
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "LACO-GbZRmKW"
},
"source": [
"# Keep only the comments written by the authors selected above\n",
"filtered_df = authors.join(df, on=['author'], how='left')"
],
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "oSxVJiSgRnlI"
},
"source": [
"# Split users into users and pseudo-users to compare them.\n",
"# A fixed seed keeps the split, and every metric derived from it, reproducible across runs.\n",
"df1, df2 = filtered_df.randomSplit([0.5, 0.5], seed=42)"
],
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "XCAGwo9oRot4"
},
"source": [
"# Build one corpus string per author by joining all of that author's comments with spaces\n",
"join_comments_udf = udf(lambda comment_list: ' '.join(comment_list), StringType())\n",
"\n",
"# Gather each author's comments into a list, separately for each half of the split\n",
"comments1 = df1.groupBy('author').agg(F.collect_list('content'))\n",
"comments2 = df2.groupBy('author').agg(F.collect_list('content'))\n",
"\n",
"df1_join_comments = comments1.withColumn(\n",
"    'corpus', join_comments_udf(comments1['collect_list(content)'])\n",
")\n",
"df2_join_comments = comments2.withColumn(\n",
"    'corpus', join_comments_udf(comments2['collect_list(content)'])\n",
")"
],
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "s0vAW9sBRToB"
},
"source": [
"# Data Cleaning - Count Links\n",
"def count_links(s):\n",
"    \"\"\"Return how many markdown link targets of the form ``(http...)`` occur in ``s``.\n",
"\n",
"    Each target is matched on its own (up to its closing parenthesis or the first\n",
"    whitespace), so links separated by other text are all counted.\n",
"    Returns 0 for ``None`` or an empty string.\n",
"    \"\"\"\n",
"    if not s:\n",
"        return 0\n",
"    return len(re.findall(r'\\(http[^)\\s]*\\)', s))\n",
"\n",
"count_links_udf = udf(count_links, IntegerType())\n",
"\n",
"df_count_links1 = df1_join_comments.withColumn('link_count', count_links_udf(df1_join_comments['corpus']))\n",
"df_count_links2 = df2_join_comments.withColumn('link_count', count_links_udf(df2_join_comments['corpus']))"
],
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "C0z4z0bOThuS"
},
"source": [
"# Drop Links\n",
"def drop_links(s):\n",
"    \"\"\"Remove every markdown link target of the form ``(http...)`` from ``s``.\n",
"\n",
"    The pattern stops at the first closing parenthesis (or whitespace), so the text\n",
"    between two links on the same line is kept. ``None`` is passed through unchanged.\n",
"    \"\"\"\n",
"    if s is None:\n",
"        return None\n",
"    return re.sub(r'\\(http[^)\\s]*\\)', '', s)\n",
"\n",
"drop_links_udf = udf(drop_links, StringType())\n",
"\n",
"df_drop_links1 = df_count_links1.withColumn('corpus', drop_links_udf(df_count_links1['corpus']))\n",
"df_drop_links2 = df_count_links2.withColumn('corpus', drop_links_udf(df_count_links2['corpus']))"
],
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "YsEILx07RVNm"
},
"source": [
"# Tokenize words\n",
"def tokenize(s):\n",
"    \"\"\"Lower-case ``s`` and split it into tokens with NLTK's TweetTokenizer.\"\"\"\n",
"    return TweetTokenizer().tokenize(s.lower())\n",
"\n",
"tokenize_udf = udf(tokenize, ArrayType(StringType()))\n",
"\n",
"df_tokens1 = df_drop_links1.withColumn(\n",
"    'tokens', tokenize_udf(df_drop_links1['corpus'])\n",
")\n",
"df_tokens2 = df_drop_links2.withColumn(\n",
"    'tokens', tokenize_udf(df_drop_links2['corpus'])\n",
")"
],
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "uSjT8O1GRXZ7"
},
"source": [
"# Tag parts of speech for each word\n",
"def pos_tagger(s):\n",
"    \"\"\"Return the part-of-speech tag of every token in ``s``, in token order.\"\"\"\n",
"    return [tag for _word, tag in nltk.pos_tag(s)]\n",
"\n",
"# A single UDF is shared by both subsets\n",
"pos_tagger_udf = udf(pos_tagger, ArrayType(StringType()))\n",
"\n",
"df_pos_tagger1 = df_tokens1.withColumn('POS', pos_tagger_udf(df_tokens1['tokens']))\n",
"df_pos_tagger2 = df_tokens2.withColumn('POS', pos_tagger_udf(df_tokens2['tokens']))"
],
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "0iAH5oN2RZWR"
},
"source": [
"# Find POS tagging tendencies to determine sentence structure patterns\n",
"def skip_grams(s):\n",
"    \"\"\"Return the string form of every skip-gram ``skipgrams(s, 2, 2)`` yields for ``s``.\"\"\"\n",
"    return [str(gram) for gram in skipgrams(s, 2, 2)]\n",
"\n",
"skip_grams_udf = udf(skip_grams, ArrayType(StringType()))\n",
"\n",
"df_skip_grams1 = df_pos_tagger1.withColumn(\n",
"    'skip_grams', skip_grams_udf(df_pos_tagger1['POS'])\n",
")\n",
"df_skip_grams2 = df_pos_tagger2.withColumn(\n",
"    'skip_grams', skip_grams_udf(df_pos_tagger2['POS'])\n",
")"
],
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "FfFLMr3mRbl3"
},
"source": [
"# Open file containing the most common skip grams made previously.\n",
"# newline='' lets the csv module handle line endings itself, as its documentation requires.\n",
"with open('skip_grams.csv', 'r', newline='') as f:\n",
"    reader = csv.reader(f)\n",
"    com_skips = list(reader)\n",
"\n",
"# The skip-grams used downstream are the fields of the first CSV row\n",
"skips = com_skips[0]"
],
"execution_count": 15,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "eF0Z1uyDRegW"
},
"source": [
"# Filter through each user's POS skip-grams and keep them if they are in the most commonly found skip-grams\n",
"\n",
"# Build the lookup once: set membership is O(1), whereas `in skips` scans the whole list per skip-gram\n",
"common_skips = frozenset(skips)\n",
"\n",
"def skip_grams_filter(s):\n",
"    \"\"\"Return the skip-grams of ``s`` that are in the common list, keeping order and duplicates.\"\"\"\n",
"    return [gram for gram in s if gram in common_skips]\n",
"\n",
"com_skips_udf = udf(skip_grams_filter, ArrayType(StringType()))\n",
"\n",
"df_com_skips1 = df_skip_grams1.withColumn('com_skips', com_skips_udf(df_skip_grams1['skip_grams']))\n",
"df_com_skips2 = df_skip_grams2.withColumn('com_skips', com_skips_udf(df_skip_grams2['skip_grams']))"
],
"execution_count": 16,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "KmeSZOOkRgIw"
},
"source": [
"# Create stop words feature list and add extra features\n",
"stops = stopwords.words('english')\n",
"# Keep only the part of each stop word before its first apostrophe\n",
"x = [i.split(\"'\")for i in stops]\n",
"stops = [i[0] for i in x]\n",
"# Truncation can produce repeated entries; drop the duplicates\n",
"stops = list(set(stops))\n",
"# Extra feature tokens: informal words, punctuation marks and formatting markers\n",
"slang_stops = ['gonna', 'coulda', 'shoulda',\n",
"               'lotta', 'lots', 'oughta', 'gotta', 'ain', 'sorta', 'kinda', 'yeah', 'whatever', 'cuz', 'ya', 'haha', 'lol', 'eh']\n",
"puncts = ['!', ':', '...', '.', '%', '$', \"'\", '\"', ';']\n",
"formattings = ['##', '__', '_', ' ', '*', '**']\n",
"\n",
"stops.extend(slang_stops)\n",
"stops.extend(puncts)\n",
"stops.extend(formattings)\n",
"# The common POS skip-grams are appended to the same list\n",
"stops.extend(skips)"
],
"execution_count": 17,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "vrWlRzn6RkWY"
},
"source": [
"# Filter words with the list of stop words\n",
"\n",
"# Build the lookup once: set membership is O(1), whereas `in stops` scans the whole list per token\n",
"stop_lookup = frozenset(stops)\n",
"\n",
"def stop_words_filter(s):\n",
"    \"\"\"Return the tokens of ``s`` that are in the stop-word feature list, keeping order and duplicates.\"\"\"\n",
"    return [token for token in s if token in stop_lookup]\n",
"\n",
"stop_words_udf = udf(stop_words_filter, ArrayType(StringType()))\n",
"\n",
"df_stop_words1 = df_com_skips1.withColumn('stop_words', stop_words_udf(df_com_skips1['tokens']))\n",
"df_stop_words2 = df_com_skips2.withColumn('stop_words', stop_words_udf(df_com_skips2['tokens']))"
],
"execution_count": 18,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Xbh3FAezRr4Q"
},
"source": [
"# Concatenate each user's list of function words and skip grams to a single array\n",
"def concat(type):\n",
"    \"\"\"Build a UDF that flattens several array columns into one array of ``type`` elements.\"\"\"\n",
"    # NOTE: the inner name `concat_` becomes part of the un-aliased output column name\n",
"    # ('concat_(stop_words, com_skips)') that the HashingTF cell reads, so do not rename it.\n",
"    def concat_(*args):\n",
"        # A null or empty argument contributes nothing to the result\n",
"        return list(chain.from_iterable((arg if arg else [] for arg in args)))\n",
"    return udf(concat_, ArrayType(type))\n",
"\n",
"concat_arrays_udf = concat(StringType())\n",
"\n",
"# Stop-word tokens come first, followed by the common skip-grams\n",
"df_all_words1 = df_stop_words1.select(\"author\", concat_arrays_udf(\"stop_words\", \"com_skips\"))\n",
"df_all_words2 = df_stop_words2.select(\"author\", concat_arrays_udf(\"stop_words\", \"com_skips\"))"
],
"execution_count": 19,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "H2FdHd4fRtMt"
},
"source": [
"# Hash the combined function word and skip gram array into a 285-slot term-frequency vector\n",
"hashingTF = HashingTF(\n",
"    numFeatures=285,\n",
"    inputCol='concat_(stop_words, com_skips)',\n",
"    outputCol='features',\n",
")\n",
"\n",
"tf1, tf2 = (hashingTF.transform(words) for words in (df_all_words1, df_all_words2))"
],
"execution_count": 20,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ZdFQO52tRurE"
},
"source": [
"# Normalize the counts so that they are a percentage of total counts of the features (L1 norm)\n",
"tf_norm1, tf_norm2 = (\n",
"    Normalizer(inputCol=\"features\", outputCol=\"features_norm\", p=1).transform(counts)\n",
"    for counts in (tf1, tf2)\n",
")"
],
"execution_count": 21,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "4N1jGUvdRv-Q"
},
"source": [
"# Standardize the vector based on average use of each feature among all users.\n",
"# The scaler is fitted on subset 1 only and then applied to both subsets.\n",
"stdscaler = StandardScaler(\n",
"    inputCol='features_norm',\n",
"    outputCol='scaled',\n",
"    withMean=True,\n",
")\n",
"scale_fit1 = stdscaler.fit(tf_norm1)\n",
"\n",
"scaled1, scaled2 = (scale_fit1.transform(frame) for frame in (tf_norm1, tf_norm2))"
],
"execution_count": 22,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "rYnE6BV_RzPm"
},
"source": [
"# Calculate the cosine similarity for each author in subset 1 against every author in subset 2\n",
"sims1 = scaled1.select('author', 'scaled')\n",
"sims2 = scaled2.select('author', 'scaled')\n",
"\n",
"# Collect each subset exactly once. Collecting subset 2 inside the loop would re-run\n",
"# the whole Spark pipeline for every author of subset 1.\n",
"rows1 = sims1.rdd.collect()\n",
"rows2 = sims2.rdd.collect()\n",
"\n",
"# similarities[auth1][auth2] = cosine similarity between the two authors' scaled vectors\n",
"similarities = {}\n",
"for auth1, vec1 in rows1:\n",
"    norm1 = vec1.norm(2)  # loop-invariant for the inner loop\n",
"    similarity = {}\n",
"    for auth2, vec2 in rows2:\n",
"        similarity[auth2] = vec1.dot(vec2) / (vec2.norm(2) * norm1)\n",
"    similarities[auth1] = similarity"
],
"execution_count": 23,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "2ZJkiJOlR2Es"
},
"source": [
"# Similarity matrix: one column per subset-1 author (outer dict keys),\n",
"# one row per subset-2 author (inner dict keys)\n",
"pdf = pd.DataFrame(similarities)"
],
"execution_count": 24,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bGuTq_7LR5BJ"
},
"source": [
"# split the cosines of authors who match with the authors who don't match\n",
"cols = pdf.columns\n",
"\n",
"# One boolean row per column label, transposed so that mask[r, c] is True\n",
"# exactly where the row author equals the column author\n",
"mask = np.array([label == pdf.index for label in cols]).T\n",
"\n",
"cosines = pdf.values\n",
"matches = cosines[mask]\n",
"non_matches = cosines[~mask]"
],
"execution_count": 25,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2F1tdVIcR5ad",
"outputId": "7aa2b955-38cc-4ea4-ae90-dafd835cc154"
},
"source": [
"# Calculate accuracy of the model: the share of authors whose true-match cosine\n",
"# beats the largest cosine in the corresponding group of non-matches\n",
"non_mas = non_matches.reshape(len(matches), -1)\n",
"non_mas_max = non_mas.max(axis=1)\n",
"(matches > non_mas_max).sum() / len(matches)"
],
"execution_count": 26,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"1.0"
]
},
"metadata": {},
"execution_count": 26
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "jzTOm3uGR7Bu"
},
"source": [
"# Aliases for the match / non-match cosine arrays, used by the threshold calculation below\n",
"match_list = matches\n",
"nonma_list = non_matches"
],
"execution_count": 27,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3CiBzf_TR-Cc",
"outputId": "5f052e5b-0ff0-40c7-f828-1a20046e7ec9"
},
"source": [
"# Calculate cosine threshold and power for a given alpha level\n",
"\n",
"# Threshold: the 99.99th percentile of a normal fitted to the non-match cosines,\n",
"# i.e. mean + z * std (alpha = 0.0001). The mean must be added, not subtracted.\n",
"n = norm.ppf(0.9999, loc=np.mean(nonma_list), scale=np.std(nonma_list))\n",
"\n",
"# Power: probability that a true match scores above the threshold,\n",
"# under a normal fitted to the match cosines\n",
"1 - norm.cdf(n, np.mean(match_list), np.std(match_list))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "F1uzcnjNR_qE"
},
"source": [
"# Plotting - Dendrogram\n",
"# Bring the subset-1 authors and their scaled feature vectors to the driver as a pandas frame\n",
"sparkdf = scaled1.select('author', 'scaled')\n",
"pandaDF = sparkdf.toPandas()"
],
"execution_count": 29,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 347
},
"id": "V_nmSDuFAuNo",
"outputId": "25038908-93d6-4483-d377-2d98372e44a3"
},
"source": [
"# Stack each author's scaled vector into one row of a dense (authors x features) matrix\n",
"features = np.vstack([vec.toArray() for vec in pandaDF['scaled']])\n",
"# Use a dedicated name so the Spark DataFrame `df` loaded earlier is not overwritten\n",
"features_df = pd.DataFrame(features, index=pandaDF['author'])\n",
"\n",
"# Cosine-distance cut-off: used both to colour the clusters and to draw the red line\n",
"threshold = 0.405\n",
"Z = hierarchy.linkage(features_df, 'single', metric=\"cosine\")\n",
"hierarchy.set_link_color_palette(None)\n",
"\n",
"# Figure width grows with the number of authors so the leaf labels stay readable\n",
"fig, axes = plt.subplots(1, 1, figsize=(len(features_df.index)*1.25, 7))\n",
"hierarchy.dendrogram(Z, ax=axes, color_threshold=threshold, labels=features_df.index.tolist())\n",
"axes.axhline(y=threshold, color='r', linestyle='-', label='threshold')\n",
"axes.set_ylabel('1 - Cosine')\n",
"axes.set_title('Hierarchical Clustering')\n",
"plt.tight_layout()\n",
"plt.legend()\n",
"plt.savefig('dendogram.png')"
],
"execution_count": 30,
"outputs": [
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 2250x504 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 279
},
"id": "NE3csDuWSAFN",
"outputId": "42a5bbc1-85da-4325-e2cb-7fe4d7c9052f"
},
"source": [
"# Matches and non-matches hist: overlay both cosine distributions on the same axes\n",
"for cosines, series_label in ((matches, 'matches'), (non_matches, 'non-matches')):\n",
"    plt.hist(cosines, label=series_label)\n",
"plt.xlabel('Cosine Similarity')\n",
"plt.legend()\n",
"plt.savefig('match_distro.png')"
],
"execution_count": 31,
"outputs": [
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
}
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment