sanand0/.gitignore

## .gitignore
*.7z
*.list
*.txt

## imdb-actors.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IMDb actors & actresses by movie\n",
    "\n",
    "Use this script to download the files:\n",
    "\n",
    "    # Download files\n",
    "    wget ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/actresses.list.gz\n",
    "    wget ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/actors.list.gz\n",
    "\n",
    "    # Unzip them\n",
    "    gzip --force --decompress *.gz\n",
    "    \n",
    "Move the files to the current folder and run the rest of this script."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import io\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def parse_movie(info):\n",
    "    parts = info.split('  ')\n",
    "    return parts[0]\n",
    "\n",
    "def parse(input_file, output_file):\n",
    "    with io.open(input_file, encoding='latin1') as input, io.open(output_file, 'w', encoding='utf8') as output:\n",
    "        # Skip everything up to the line starting with \"Name\"\n",
    "        for row in input:\n",
    "            if row.startswith('Name\\t'):\n",
    "                next(input)\n",
    "                break\n",
    "\n",
    "        # Parse each line into output\n",
    "        person = None\n",
    "        for row in input:\n",
    "            if not row.strip():\n",
    "                continue\n",
    "            if '\\t' not in row:             # Stop gracefully at the end\n",
    "                break\n",
    "            if not row.startswith('\\t'):\n",
    "                person, info = row.split('\\t', 1)\n",
    "            else:\n",
    "                info = row\n",
    "            movie = parse_movie(info.strip())\n",
    "            if ('\"' not in movie                     # ignore TV series\n",
    "                and ' (TV)' not in movie             # ignore TV series\n",
    "                and ' (V)' not in movie              # ignore videos\n",
    "                and ' {{SUSPENDED}}' not in movie    # or suspended movies\n",
    "            ):\n",
    "                output.write(person + '\\t' + movie + '\\n')\n",
    "\n",
    "parse('actors.list', 'actors.txt')\n",
    "parse('actresses.list', 'actresses.txt')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To extract the movies for a given year (e.g. 2010), use:\n",
    "\n",
    "    grep -F ' (2010)' actors.txt actresses.txt > 2010.txt"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# IMDb actors & actresses by movie\n",
	"\n",
	"Use this script to download the files:\n",
	"\n",
	" # Download files\n",
	" wget ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/actresses.list.gz\n",
	" wget ftp://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/actors.list.gz\n",
	"\n",
	" # Unzip them\n",
	" gzip --force --decompress *.gz\n",
	" \n",
	"Move the files to the current folder and run the rest of this script."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import io\n",
	"import re"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def parse_movie(info):\n",
	" parts = info.split(' ')\n",
	" return parts[0]\n",
	"\n",
	"def parse(input_file, output_file):\n",
	" with io.open(input_file, encoding='latin1') as input, io.open(output_file, 'w', encoding='utf8') as output:\n",
	" # Skip everything up to the line starting with \"Name\"\n",
	" for row in input:\n",
	" if row.startswith('Name\\t'):\n",
	" next(input)\n",
	" break\n",
	"\n",
	" # Parse each line into output\n",
	" person = None\n",
	" for row in input:\n",
	" if not row.strip():\n",
	" continue\n",
	" if '\\t' not in row: # Stop gracefully at the end\n",
	" break\n",
	" if not row.startswith('\\t'):\n",
	" person, info = row.split('\\t', 1)\n",
	" else:\n",
	" info = row\n",
	" movie = parse_movie(info.strip())\n",
	" if ('\"' not in movie # ignore TV series\n",
	" and ' (TV)' not in movie # ignore TV series\n",
	" and ' (V)' not in movie # ignore videos\n",
	" and ' {{SUSPENDED}}' not in movie # or suspended movies\n",
	" ):\n",
	" output.write(person + '\\t' + movie + '\\n')\n",
	"\n",
	"parse('actors.list', 'actors.txt')\n",
	"parse('actresses.list', 'actresses.txt')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"To extract the movies for a given year (e.g. 2010), use:\n",
	"\n",
	" grep -F ' (2010)' actors.txt actresses.txt > 2010.txt"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}