Skip to content

Instantly share code, notes, and snippets.

@wesyoung
Created April 26, 2017 19:21
Show Gist options
  • Save wesyoung/483756f102d6f8e16c18b70465f471d8 to your computer and use it in GitHub Desktop.
Save wesyoung/483756f102d6f8e16c18b70465f471d8 to your computer and use it in GitHub Desktop.
import magic
import sys
from pprint import pprint
# Path of the file to classify, taken from the first command-line argument.
# NOTE(review): evaluated at import time, so running without an argument
# raises IndexError before anything else happens.
f = sys.argv[1]
def _is_ascii(f, mime):
if mime == 'ASCII text':
return True
def _is_rss(f, mime):
    """Placeholder RSS detector; currently never reports a match.

    Args:
        f: File object, forwarded to ``_is_xml``.
        mime: File description string, forwarded to ``_is_xml``.

    Returns:
        Always None. The XML pre-check is performed, but no RSS-specific
        inspection follows it.
    """
    looks_like_xml = bool(_is_xml(f, mime))
    if looks_like_xml:
        # NOTE(review): nothing RSS-specific is implemented yet, so even
        # XML input yields no result.
        return None
    return None
def _is_xml(f, mime):
if mime == 'XML document text':
return 'xml'
def _is_json(f, mime):
    """Heuristically decide whether an ASCII text file holds JSON.

    Only the first and last non-blank lines are inspected: the content
    must open with ``[`` or ``{`` and close with ``]`` or ``}``. A payload
    wrapped in a matching pair of single quotes (the only form the earlier
    implementation recognised) is still accepted.

    Args:
        f: Open text-mode file object; it is rewound when possible and
            then read to the end.
        mime: File description string, checked with ``_is_ascii``.

    Returns:
        ``'json'`` when the content looks like JSON, otherwise None.
    """
    if not _is_ascii(f, mime):
        return None

    # Other detectors share this handle and may already have consumed
    # lines, so start from the top whenever the stream allows it.
    try:
        f.seek(0)
    except (AttributeError, OSError, ValueError):
        pass  # non-seekable stream: best effort from the current position

    # Track only the outermost non-blank lines; stripping whitespace also
    # takes care of CRLF endings and indentation.
    first = last = None
    for raw in f:
        line = raw.strip()
        if not line:
            continue
        if first is None:
            first = line
        last = line

    if first is None:
        return None  # empty or whitespace-only file

    # Backward compatibility: tolerate a payload enclosed in single quotes.
    if first.startswith("'") and last.endswith("'"):
        first = first[1:]
        last = last[:-1]

    if not first.startswith(('[', '{')):
        return None
    if not last.endswith((']', '}')):
        return None
    return 'json'
def _is_delimited(f, mime):
    """Guess the field delimiter of an ASCII text file from two lines.

    Leading lines that start with ``#`` are skipped. The next line and the
    one after it are then compared: a delimiter qualifies when it occurs
    at least once in the first of them and exactly as often in the second.

    Args:
        f: Open text-mode file object, read from its current position.
        mime: File description string, checked with ``_is_ascii``.

    Returns:
        ``'tsv'``, ``'csv'``, ``'pipe'`` or ``'semicolon'`` for the first
        qualifying delimiter, False when none qualifies, or None when the
        file is not ASCII text.
    """
    if not _is_ascii(f, mime):
        return None

    labels = {
        "\t": 'tsv',
        ',': 'csv',
        '|': 'pipe',
        ';': 'semicolon',
    }

    # Advance past the '#' comment preamble to the first candidate line.
    header = f.readline().rstrip("\n")
    while header.startswith('#'):
        header = f.readline().rstrip("\n")
    follower = f.readline().rstrip("\n")

    for delimiter, label in labels.items():
        expected = header.count(delimiter)
        if expected and follower.count(delimiter) == expected:
            return label
    return False
# Detectors tried in order by feed_type; the first truthy result wins.
# _is_rss is intentionally absent from this list in the current code.
TESTS = [_is_delimited, _is_json, _is_xml]
def feed_type(f, mime):
    """Run each detector in ``TESTS`` and return the first positive result.

    Args:
        f: Open text-mode file object handed to every detector.
        mime: File description string handed to every detector.

    Returns:
        The first truthy value returned by a detector (for example
        ``'csv'``, ``'json'`` or ``'xml'``), or None when none matches.
    """
    for detector in TESTS:
        # Detectors read from the shared handle; rewind before each one so
        # a later detector does not start where an earlier one stopped.
        try:
            f.seek(0)
        except (AttributeError, OSError, ValueError):
            pass  # non-seekable stream: best effort from current position
        verdict = detector(f, mime)
        if verdict:
            return verdict
    return None
# Script body: open the path given on the command line, obtain its
# description from magic.from_file, print that description, then print the
# feed type detected from it (None when no detector matches).
with open(f) as FILE:
    mime_type = magic.from_file(f)
    print(mime_type)
    print(feed_type(FILE, mime_type))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment