jmasonherr/Collect Parler Metadata.ipynb

## _readme.md

      
    Raw
  

              _readme.md
            
          
    Parler video GPS data

Download the collected CSV data here.
Using metadata collected by https://twitter.com/donk_enby
Many videos are still available as of January 11th, 2021.

Add 8.240.242.124 video.parler.com to your /etc/hosts file.
Go to the URL f'https://video.parler.com/{e[:2]}/{e[2:4]}/{e}.mp4'. Here is an example.

No rights reserved


## Collect Parler Metadata.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First, download [python-utils](https://github.com/kylemcdonald/python-utils) to the `utils` folder.\n",
    "\n",
    "`pip install ujson` for fast json loading."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils.list_files import list_files\n",
    "from utils.progress import progress_parallel\n",
    "\n",
    "import ujson as json\n",
    "\n",
    "from itertools import islice\n",
    "from collections import Counter\n",
    "import datetime\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load all the `.json` files from the `metadata/` folder. Should take two minutes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1032523 0:02:11 7910.85/s\n"
     ]
    }
   ],
   "source": [
    "def load(fn):\n",
    "    with open(fn) as f:\n",
    "        data = json.load(f)\n",
    "    video_id = os.path.basename(fn).split('.')[0].split('-')[1]\n",
    "    for i in range(len(data)):\n",
    "        data[i]['tag'] = video_id\n",
    "    return data\n",
    "\n",
    "metadata = progress_parallel(load, list_files('metadata'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save all the results to a pickle file. This way we can reload quickly later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1032523"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pickle\n",
    "with open('metadata.pkl', 'wb') as f:\n",
    "    pickle.dump(metadata, f)\n",
    "    \n",
    "# with open('metadata.pkl', 'rb') as f:\n",
    "#     metadata = pickle.load(f)\n",
    "\n",
    "len(metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# show common metadata keys\n",
    "# all_keys = []\n",
    "# for m in metadata:\n",
    "#     for e in m:\n",
    "#         all_keys.extend(e.keys())\n",
    "# counts = Counter(all_keys)\n",
    "# print(counts.most_common(20))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dms_to_dd(dms):\n",
    "    parts = dms.split(' ')\n",
    "    d = int(parts[0])\n",
    "    m = int(parts[2][:-1])\n",
    "    s = float(parts[3][:-1])\n",
    "    dd = d + float(m)/60 + float(s)/3600\n",
    "    if parts[-1] == 'W' or parts[-1] == 'S':\n",
    "        dd *= -1\n",
    "    return dd\n",
    "\n",
    "def geocoded():\n",
    "    for m in metadata:\n",
    "        for e in m:\n",
    "            try:\n",
    "                lon = e['GPSLongitude']\n",
    "                lat = e['GPSLatitude']\n",
    "                created = e['CreateDate']\n",
    "                video_id = e['tag']\n",
    "                lon = dms_to_dd(lon)\n",
    "                lat = dms_to_dd(lat)\n",
    "                created = datetime.datetime.strptime(created, '%Y:%m:%d %H:%M:%S')\n",
    "                yield (lon, lat, created, video_id)\n",
    "            except KeyError:\n",
    "                continue\n",
    "            except ValueError:\n",
    "                continue\n",
    "\n",
    "# collect all geocoded data\n",
    "collected = list(geocoded())\n",
    "\n",
    "# write all data to a single file\n",
    "# with open('all-geocoded.csv', 'w') as f:\n",
    "#     f.write('Longitude,Latitude,Timestamp,ID\\n')\n",
    "#     for lon, lat, created, video_id in collected:\n",
    "#         f.write(f'{lon},{lat},{created},{video_id}\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define bounding box and start date."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# capitol only\n",
    "# top,left = [38.892110, -77.015188]\n",
    "# bottom,right = [38.887619, -77.005959]\n",
    "\n",
    "# dc area\n",
    "top,left = [38.978921, -77.140698]\n",
    "bottom,right = [38.808277, -76.914339]\n",
    "\n",
    "start = datetime.datetime(2021,1,5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1499"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "to_download = []\n",
    "for lon,lat,created,video_id in collected:\n",
    "    if lon < left or lon > right or lat > top or lat < bottom:\n",
    "        continue\n",
    "    if created < start:\n",
    "        continue\n",
    "    to_download.append(video_id)\n",
    "    \n",
    "len(to_download)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save all urls to a text file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('dc-area.txt', 'w') as f:\n",
    "    for e in to_download:\n",
    "        f.write(f'https://video.parler.com/{meta[:2]}/{meta[2:4]}/{meta}.mp4\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download all the urls in parallel by using [this downloader script](https://gist.github.com/kylemcdonald/3cbd09752e340849e4b3cb4f12dd8c85).\n",
    "\n",
    "`python download-urls.py -l ../washington-area.txt`\n",
    "\n",
    "You will need to edit your `/etc/hosts` to add the following line:\n",
    "    \n",
    "`8.240.242.124 video.parler.com`"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

## geocoded.jpg

      
    Raw
  

              geocoded.jpg
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"First, download [python-utils](https://github.com/kylemcdonald/python-utils) to the `utils` folder.\n",
	"\n",
	"`pip install ujson` for fast json loading."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"from utils.list_files import list_files\n",
	"from utils.progress import progress_parallel\n",
	"\n",
	"import ujson as json\n",
	"\n",
	"from itertools import islice\n",
	"from collections import Counter\n",
	"import datetime\n",
	"import os"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Load all the `.json` files from the `metadata/` folder. Should take two minutes."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1032523 0:02:11 7910.85/s\n"
	]
	}
	],
	"source": [
	"def load(fn):\n",
	" with open(fn) as f:\n",
	" data = json.load(f)\n",
	" video_id = os.path.basename(fn).split('.')[0].split('-')[1]\n",
	" for i in range(len(data)):\n",
	" data[i]['tag'] = video_id\n",
	" return data\n",
	"\n",
	"metadata = progress_parallel(load, list_files('metadata'))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Save all the results to a pickle file. This way we can reload quickly later."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1032523"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import pickle\n",
	"with open('metadata.pkl', 'wb') as f:\n",
	" pickle.dump(metadata, f)\n",
	" \n",
	"# with open('metadata.pkl', 'rb') as f:\n",
	"# metadata = pickle.load(f)\n",
	"\n",
	"len(metadata)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"# show common metadata keys\n",
	"# all_keys = []\n",
	"# for m in metadata:\n",
	"# for e in m:\n",
	"# all_keys.extend(e.keys())\n",
	"# counts = Counter(all_keys)\n",
	"# print(counts.most_common(20))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"def dms_to_dd(dms):\n",
	" parts = dms.split(' ')\n",
	" d = int(parts[0])\n",
	" m = int(parts[2][:-1])\n",
	" s = float(parts[3][:-1])\n",
	" dd = d + float(m)/60 + float(s)/3600\n",
	" if parts[-1] == 'W' or parts[-1] == 'S':\n",
	" dd *= -1\n",
	" return dd\n",
	"\n",
	"def geocoded():\n",
	" for m in metadata:\n",
	" for e in m:\n",
	" try:\n",
	" lon = e['GPSLongitude']\n",
	" lat = e['GPSLatitude']\n",
	" created = e['CreateDate']\n",
	" video_id = e['tag']\n",
	" lon = dms_to_dd(lon)\n",
	" lat = dms_to_dd(lat)\n",
	" created = datetime.datetime.strptime(created, '%Y:%m:%d %H:%M:%S')\n",
	" yield (lon, lat, created, video_id)\n",
	" except KeyError:\n",
	" continue\n",
	" except ValueError:\n",
	" continue\n",
	"\n",
	"# collect all geocoded data\n",
	"collected = list(geocoded())\n",
	"\n",
	"# write all data to a single file\n",
	"# with open('all-geocoded.csv', 'w') as f:\n",
	"# f.write('Longitude,Latitude,Timestamp,ID\\n')\n",
	"# for lon, lat, created, video_id in collected:\n",
	"# f.write(f'{lon},{lat},{created},{video_id}\\n')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Define bounding box and start date."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 55,
	"metadata": {},
	"outputs": [],
	"source": [
	"# capitol only\n",
	"# top,left = [38.892110, -77.015188]\n",
	"# bottom,right = [38.887619, -77.005959]\n",
	"\n",
	"# dc area\n",
	"top,left = [38.978921, -77.140698]\n",
	"bottom,right = [38.808277, -76.914339]\n",
	"\n",
	"start = datetime.datetime(2021,1,5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 56,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1499"
	]
	},
	"execution_count": 56,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"to_download = []\n",
	"for lon,lat,created,video_id in collected:\n",
	" if lon < left or lon > right or lat > top or lat < bottom:\n",
	" continue\n",
	" if created < start:\n",
	" continue\n",
	" to_download.append(video_id)\n",
	" \n",
	"len(to_download)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Save all urls to a text file."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 52,
	"metadata": {},
	"outputs": [],
	"source": [
	"with open('dc-area.txt', 'w') as f:\n",
	" for e in to_download:\n",
	" f.write(f'https://video.parler.com/{meta[:2]}/{meta[2:4]}/{meta}.mp4\\n')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Download all the urls in parallel by using [this downloader script](https://gist.github.com/kylemcdonald/3cbd09752e340849e4b3cb4f12dd8c85).\n",
	"\n",
	"`python download-urls.py -l ../washington-area.txt`\n",
	"\n",
	"You will need to edit your `/etc/hosts` to add the following line:\n",
	" \n",
	"`8.240.242.124 video.parler.com`"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}