Skip to content

Instantly share code, notes, and snippets.

@moschlar
Created August 14, 2012 08:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save moschlar/3347601 to your computer and use it in GitHub Desktop.
Save moschlar/3347601 to your computer and use it in GitHub Desktop.
Tally file MIME encodings per file extension with python-magic
#!/usr/bin/env python2
import os, sys
from magic import Magic
from collections import defaultdict
from itertools import groupby
from pprint import pprint
def iter_file_encodings(paths, detect, log=None):
    """Yield an ``(extension, encoding)`` pair for every file below *paths*.

    paths  -- iterable of directory names handed to ``os.walk``.
    detect -- callable taking a file path and returning its encoding label;
              whatever it returns is passed through unchanged.
    log    -- optional file-like object; each visited directory is written
              to it on its own line as a progress indicator.
    """
    for path in paths:
        for root, dirs, files in os.walk(path):
            if log is not None:
                log.write(root + "\n")
            # Prune in place: os.walk (top-down) only descends into the
            # entries still present in ``dirs``.
            if '.svn' in dirs:
                dirs.remove('.svn')
            for name in files:
                encoding = detect(os.path.join(root, name))
                extension = os.path.splitext(name)[1]
                yield extension, encoding


def cross_tabulate(pairs):
    """Count ``(extension, encoding)`` pairs in both directions.

    Returns a tuple ``(by_encoding, by_extension)`` of plain nested dicts:
    ``by_encoding[encoding][extension]`` and
    ``by_extension[extension][encoding]`` both hold the number of times that
    combination occurred in *pairs*.
    """
    by_encoding = defaultdict(lambda: defaultdict(int))
    by_extension = defaultdict(lambda: defaultdict(int))
    for extension, encoding in pairs:
        by_encoding[encoding][extension] += 1
        by_extension[extension][encoding] += 1
    # Convert to plain dicts so pprint shows the contents rather than the
    # defaultdict wrapper.
    return (
        dict((key, dict(counts)) for key, counts in by_encoding.items()),
        dict((key, dict(counts)) for key, counts in by_extension.items()),
    )


def main(argv=None):
    """Scan the directories in *argv* (default: ``sys.argv[1:]``) and report.

    Visited directories go to stderr; the two tables (encoding -> extension
    counts, then extension -> encoding counts) are pretty-printed to stdout.
    """
    paths = sys.argv[1:] if argv is None else argv
    # Only the encoding detector is needed; the tally never uses MIME types.
    detector = Magic(mime_encoding=True)
    types = list(iter_file_encodings(paths, detector.from_file, log=sys.stderr))
    tt, te = cross_tabulate(types)
    # Prettyprint
    pprint(tt)
    pprint(te)


if __name__ == "__main__":
    main()
{None: {'.version': 2},
'Composite Document File V2 Document, No summary infobinary': {'.db': 1},
'application/mswordbinary': {'.doc': 34, '.ppt': 7, '.pub': 4},
'application/vnd.ms-excelbinary': {'.xls': 4},
'binary': {'': 14,
'.1': 24,
'.AVI': 1,
'.JPG': 1191,
'.MOV': 9,
'.PNG': 16,
'.THM': 1,
'.ap_': 1,
'.apk': 1,
'.bbl': 2,
'.blg': 2,
'.c': 1,
'.class': 220,
'.dat': 4,
'.dex': 1,
'.dia': 1,
'.docx': 26,
'.epub': 5,
'.exe': 23,
'.flv': 1,
'.gif': 64,
'.gz': 8,
'.ico': 2,
'.index': 1,
'.jar': 9,
'.java': 1,
'.jpeg': 8,
'.jpg': 594,
'.key': 6,
'.lck': 4,
'.m4a': 16,
'.m4v': 19,
'.mobi': 1,
'.mov': 1,
'.mp3': 89,
'.mp3 Alias': 1,
'.mp4': 5,
'.o': 2,
'.obj': 1,
'.odp': 2,
'.ods': 2,
'.odt': 12,
'.out': 1,
'.pc2': 3,
'.pdf': 873,
'.pkt': 1,
'.pl': 1,
'.png': 101,
'.pptx': 2,
'.properties': 2,
'.psd': 2,
'.rar': 10,
'.resources': 1,
'.sav': 1,
'.snm': 1,
'.tbz': 3,
'.tif': 15,
'.tree': 1,
'.txt': 1,
'.vpp': 2,
'.vpp~1': 1,
'.vpp~10': 1,
'.vpp~11': 1,
'.vpp~12': 1,
'.vpp~2': 1,
'.vpp~3': 1,
'.vpp~5': 1,
'.vpp~6': 1,
'.xcf': 2,
'.xlsx': 8,
'.zip': 74},
'iso-8859-1': {'.bib': 1,
'.c': 10,
'.eml': 1,
'.html': 1,
'.java': 72,
'.log': 2,
'.pl': 2,
'.properties': 1,
'.tex': 6,
'.txt': 26},
'unknown-8bit': {'.pdf': 1, '.txt': 2},
'us-ascii': {'': 54,
'.BAK': 1,
'.TXT': 1,
'.arff': 1,
'.aux': 5,
'.axd': 2,
'.bak': 6,
'.bat': 2,
'.bkup': 1,
'.c': 22,
'.cfg': 1,
'.cmd': 1,
'.css': 3,
'.csv': 1,
'.cu': 1,
'.dat': 3,
'.doc': 1,
'.eps': 2,
'.form': 1,
'.history': 1,
'.html': 21,
'.in': 2,
'.index': 1,
'.ini': 4,
'.java': 160,
'.jff': 5,
'.js': 2,
'.log': 8,
'.mf': 5,
'.mw': 8,
'.mws': 1,
'.nav': 1,
'.orig': 1,
'.out': 2,
'.pc2': 1,
'.pdf': 5,
'.php': 6,
'.pkgbkup': 10,
'.pl': 42,
'.prefs': 17,
'.private_key': 1,
'.prj': 1,
'.properties': 18,
'.py': 1,
'.rkt': 7,
'.rs': 1,
'.rtf': 1,
'.sh': 2,
'.sql': 5,
'.ss': 10,
'.svg': 7,
'.tex': 1,
'.toc': 1,
'.tps': 1,
'.txt': 35,
'.ucd': 5,
'.uml': 2,
'.ump': 2,
'.url': 1,
'.xml': 43},
'utf-8': {'': 1,
'.BAK': 8,
'.bak': 6,
'.c': 12,
'.css': 3,
'.dat': 1,
'.html': 5,
'.java': 55,
'.jff': 1,
'.js': 3,
'.php': 11,
'.pl': 20,
'.py': 1,
'.rkt': 6,
'.sql': 4,
'.ss': 2,
'.tex': 16,
'.txt': 5,
'.xml': 7}}
{'': {'binary': 14, 'us-ascii': 54, 'utf-8': 1},
'.1': {'binary': 24},
'.AVI': {'binary': 1},
'.BAK': {'us-ascii': 1, 'utf-8': 8},
'.JPG': {'binary': 1191},
'.MOV': {'binary': 9},
'.PNG': {'binary': 16},
'.THM': {'binary': 1},
'.TXT': {'us-ascii': 1},
'.ap_': {'binary': 1},
'.apk': {'binary': 1},
'.arff': {'us-ascii': 1},
'.aux': {'us-ascii': 5},
'.axd': {'us-ascii': 2},
'.bak': {'us-ascii': 6, 'utf-8': 6},
'.bat': {'us-ascii': 2},
'.bbl': {'binary': 2},
'.bib': {'iso-8859-1': 1},
'.bkup': {'us-ascii': 1},
'.blg': {'binary': 2},
'.c': {'binary': 1, 'iso-8859-1': 10, 'us-ascii': 22, 'utf-8': 12},
'.cfg': {'us-ascii': 1},
'.class': {'binary': 220},
'.cmd': {'us-ascii': 1},
'.css': {'us-ascii': 3, 'utf-8': 3},
'.csv': {'us-ascii': 1},
'.cu': {'us-ascii': 1},
'.dat': {'binary': 4, 'us-ascii': 3, 'utf-8': 1},
'.db': {'Composite Document File V2 Document, No summary infobinary': 1},
'.dex': {'binary': 1},
'.dia': {'binary': 1},
'.doc': {'application/mswordbinary': 34, 'us-ascii': 1},
'.docx': {'binary': 26},
'.eml': {'iso-8859-1': 1},
'.eps': {'us-ascii': 2},
'.epub': {'binary': 5},
'.exe': {'binary': 23},
'.flv': {'binary': 1},
'.form': {'us-ascii': 1},
'.gif': {'binary': 64},
'.gz': {'binary': 8},
'.history': {'us-ascii': 1},
'.html': {'iso-8859-1': 1, 'us-ascii': 21, 'utf-8': 5},
'.ico': {'binary': 2},
'.in': {'us-ascii': 2},
'.index': {'binary': 1, 'us-ascii': 1},
'.ini': {'us-ascii': 4},
'.jar': {'binary': 9},
'.java': {'binary': 1, 'iso-8859-1': 72, 'us-ascii': 160, 'utf-8': 55},
'.jff': {'us-ascii': 5, 'utf-8': 1},
'.jpeg': {'binary': 8},
'.jpg': {'binary': 594},
'.js': {'us-ascii': 2, 'utf-8': 3},
'.key': {'binary': 6},
'.lck': {'binary': 4},
'.log': {'iso-8859-1': 2, 'us-ascii': 8},
'.m4a': {'binary': 16},
'.m4v': {'binary': 19},
'.mf': {'us-ascii': 5},
'.mobi': {'binary': 1},
'.mov': {'binary': 1},
'.mp3': {'binary': 89},
'.mp3 Alias': {'binary': 1},
'.mp4': {'binary': 5},
'.mw': {'us-ascii': 8},
'.mws': {'us-ascii': 1},
'.nav': {'us-ascii': 1},
'.o': {'binary': 2},
'.obj': {'binary': 1},
'.odp': {'binary': 2},
'.ods': {'binary': 2},
'.odt': {'binary': 12},
'.orig': {'us-ascii': 1},
'.out': {'binary': 1, 'us-ascii': 2},
'.pc2': {'binary': 3, 'us-ascii': 1},
'.pdf': {'binary': 873, 'unknown-8bit': 1, 'us-ascii': 5},
'.php': {'us-ascii': 6, 'utf-8': 11},
'.pkgbkup': {'us-ascii': 10},
'.pkt': {'binary': 1},
'.pl': {'binary': 1, 'iso-8859-1': 2, 'us-ascii': 42, 'utf-8': 20},
'.png': {'binary': 101},
'.ppt': {'application/mswordbinary': 7},
'.pptx': {'binary': 2},
'.prefs': {'us-ascii': 17},
'.private_key': {'us-ascii': 1},
'.prj': {'us-ascii': 1},
'.properties': {'binary': 2, 'iso-8859-1': 1, 'us-ascii': 18},
'.psd': {'binary': 2},
'.pub': {'application/mswordbinary': 4},
'.py': {'us-ascii': 1, 'utf-8': 1},
'.rar': {'binary': 10},
'.resources': {'binary': 1},
'.rkt': {'us-ascii': 7, 'utf-8': 6},
'.rs': {'us-ascii': 1},
'.rtf': {'us-ascii': 1},
'.sav': {'binary': 1},
'.sh': {'us-ascii': 2},
'.snm': {'binary': 1},
'.sql': {'us-ascii': 5, 'utf-8': 4},
'.ss': {'us-ascii': 10, 'utf-8': 2},
'.svg': {'us-ascii': 7},
'.tbz': {'binary': 3},
'.tex': {'iso-8859-1': 6, 'us-ascii': 1, 'utf-8': 16},
'.tif': {'binary': 15},
'.toc': {'us-ascii': 1},
'.tps': {'us-ascii': 1},
'.tree': {'binary': 1},
'.txt': {'binary': 1,
'iso-8859-1': 26,
'unknown-8bit': 2,
'us-ascii': 35,
'utf-8': 5},
'.ucd': {'us-ascii': 5},
'.uml': {'us-ascii': 2},
'.ump': {'us-ascii': 2},
'.url': {'us-ascii': 1},
'.version': {None: 2},
'.vpp': {'binary': 2},
'.vpp~1': {'binary': 1},
'.vpp~10': {'binary': 1},
'.vpp~11': {'binary': 1},
'.vpp~12': {'binary': 1},
'.vpp~2': {'binary': 1},
'.vpp~3': {'binary': 1},
'.vpp~5': {'binary': 1},
'.vpp~6': {'binary': 1},
'.xcf': {'binary': 2},
'.xls': {'application/vnd.ms-excelbinary': 4},
'.xlsx': {'binary': 8},
'.xml': {'us-ascii': 43, 'utf-8': 7},
'.zip': {'binary': 74}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment