Created
December 1, 2015 10:37
-
-
Save y16ra/1526ec9a479d589ef71f to your computer and use it in GitHub Desktop.
slackからexportしたデータをMongoDBに取り込む
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf_8 -*- | |
import json | |
import sys, os, datetime | |
import pymongo | |
# Python 2 only: reload sys to regain access to setdefaultencoding() and make
# UTF-8 the codec for implicit str/unicode conversions. Neither name exists on
# Python 3, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf_8')

# Root directory of the unpacked Slack export.
BASE_DIR = '/Users/WORKDIR_PATH'

# Export files (base name without extension) that are never imported.
# 'users' is intentionally not listed: its records are stored as-is.
EXCLUDE_FILES = (
    'channels',
    'integration_logs',
)

# Message subtypes that are dropped instead of being stored.
EXCLUDE_POST_TYPES = (
    'channel_join',
    'channel_purpose',
)


def load_users(base_dir):
    """Return a {user id: user name} dict read from <base_dir>/users.json."""
    # os.path.join supplies the separator whether or not base_dir ends with
    # one. Binary mode lets the json module do the decoding itself.
    with open(os.path.join(base_dir, 'users.json'), 'rb') as f:
        return {user['id']: user['name'] for user in json.load(f)}


def channel_name_for(base_dir, root, file_name):
    """Return the MongoDB collection name for a file found while walking.

    Files inside a sub-directory of base_dir use that directory's path
    relative to base_dir (the channel name). Files directly inside base_dir
    use their own file name, extension included.
    """
    rel_dir = os.path.relpath(root, base_dir)
    if rel_dir == os.curdir:
        return file_name
    return rel_dir


def build_post(data, filename, users_dict):
    """Build the document stored for one message record.

    data is one decoded JSON object and must contain a 'ts' key (KeyError
    otherwise). filename is the source file's base name without extension.
    users_dict maps user ids to user names.
    """
    fallback_name = data['username'] if 'username' in data else 'none'
    if 'user' in data and data['user'] in users_dict:
        user_name = users_dict[data['user']]
    else:
        user_name = fallback_name
    # fromtimestamp() without a tz argument yields naive local time.
    posted_at = datetime.datetime.fromtimestamp(float(data['ts']))
    return {
        'json_filename': filename,
        'user_id': data['user'] if 'user' in data else fallback_name,
        'user': user_name,
        'text': data['text'] if 'text' in data else 'none',
        'ts': posted_at.strftime("%Y-%m-%d %H:%M:%S"),
        'raw': data,
    }


def upsert_document(col, doc):
    """Insert doc into col unless an identical document already matches."""
    # The check is made on the class, not the instance: PyMongo collections
    # resolve unknown instance attributes to sub-collections.
    if hasattr(type(col), 'replace_one'):
        col.replace_one(doc, doc, upsert=True)
    else:
        # Older PyMongo releases only provide update().
        col.update(doc, doc, upsert=True)


def import_export(base_dir, db, users_dict):
    """Walk base_dir and upsert every importable JSON record into db.

    db is indexed by collection name (db[name]). users_dict maps user ids to
    user names.
    """
    for root, _dirs, files in os.walk(base_dir):
        for file_ in files:
            filename, ext = os.path.splitext(file_)
            print('now processing... :' + filename)
            if ext != '.json' or filename in EXCLUDE_FILES:
                continue
            # Resolve the collection only for files that are really imported,
            # so skipped files never produce a collection name.
            col = db[channel_name_for(base_dir, root, file_)]
            with open(os.path.join(root, file_), 'rb') as f:
                json_data = json.load(f)
            for data in json_data:
                if 'subtype' in data and data['subtype'] in EXCLUDE_POST_TYPES:
                    continue
                if filename == 'users':
                    # User records are stored unchanged.
                    upsert_document(col, data)
                    continue
                upsert_document(col, build_post(data, filename, users_dict))


def main():
    """Import the Slack export under BASE_DIR into the slack_data database."""
    users_dict = load_users(BASE_DIR)
    # Connect to the local MongoDB server.
    client = pymongo.MongoClient('localhost', 27017)
    # The database is accessed by attribute name.
    db = client.slack_data
    import_export(BASE_DIR, db, users_dict)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment