Skip to content

Instantly share code, notes, and snippets.

@Ikuyadeu
Last active December 18, 2017 18:33
Show Gist options
  • Save Ikuyadeu/211dd4fc8f9d63644583ce442fcd712b to your computer and use it in GitHub Desktop.
Save Ikuyadeu/211dd4fc8f9d63644583ce442fcd712b to your computer and use it in GitHub Desktop.
Get the list of changed files from the database

Usage to get data

Get DB

Download the database from http://kin-y.github.io/miningReviewRepo/

Run Python

python3 GetFileList.py gm_openstack user passwd
mkdir revision_files
python3 RequestFileDiff.py gm_openstack https://review.openstack.org start end --from-ini
#!/usr/bin/env python3
"""
Get file list from mysql
Usage:
$ python3 src/GetFileList.py gm_openstack user passwd
Output:
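./<db name>.csv (./gm_openstack.csv for the usage above), one row per file:
- "ch_id": Change id
- "ch_change_id": ch_changeId column of t_change
- "rev_id": Revision id
- "rev_change_id": rev_changeId column of t_revision
- "f_file_name": Encoded file path
- "rev_patchSetNum": rev_patchSetNum column of t_revision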
"""
import sys
import csv
from urllib.parse import quote_plus
from collections import defaultdict
import MySQLdb
def main():
    """
    Export the changed-file list of a review database to ``<db>.csv``.

    Command line: ``GetFileList.py <db> <user> <passwd>``. When the number of
    arguments is not exactly three, the defaults ``gm_openstack`` / ``root`` /
    empty password are used instead.

    The tables t_change, t_revision and t_file are read completely and joined
    in memory: revisions are attached to a change through ``rev_changeId``
    (matched against ``t_change.id``) and files to a revision through
    ``f_revisionId`` (matched against ``t_revision.id``). One CSV row is
    written per file of every revision of every change; the file name is
    encoded with ``quote_plus``.
    """
    # set argument
    if len(sys.argv) == 4:
        current_db, user, passwd = sys.argv[1:4]
    else:
        current_db, user, passwd = "gm_openstack", "root", ""

    # Connect DB
    connection = MySQLdb.connect(db=current_db, user=user, passwd=passwd)
    try:
        cursor = connection.cursor()

        # Get changes
        sys.stdout.write("\rCollecting changes...")
        cursor.execute("SELECT id, ch_Id, ch_changeId FROM t_change")
        changes = cursor.fetchall()

        # Get revisions
        sys.stdout.write("\rCollecting revisions...")
        cursor.execute(
            "SELECT id, rev_Id, rev_changeId, rev_patchSetNum FROM t_revision")
        revisions = cursor.fetchall()

        # Get files
        sys.stdout.write("\rCollecting files...")
        cursor.execute("SELECT f_fileName, f_revisionId FROM t_file")
        files = cursor.fetchall()
    finally:
        # Close DB connection even when one of the queries fails
        connection.close()

    # Group revisions by the change they belong to (rev_changeId)
    t_revision_dic = defaultdict(list)
    for revision in revisions:
        t_revision_dic[revision[2]].append(revision)

    # Group files by the revision they belong to (f_revisionId)
    t_file_dic = defaultdict(list)
    for rev_file in files:
        t_file_dic[int(rev_file[1])].append(rev_file)

    # File list for output
    output_files = []

    # Search from changes
    changes_len = len(changes)
    for i, (change_key, ch_id, ch_change_id) in enumerate(changes):
        ch_revisions = t_revision_dic[change_key]
        revisions_len = len(ch_revisions)

        # Search from revisions
        for j, revision in enumerate(ch_revisions):
            rev_key, rev_id, rev_change_id, rev_patch_set_num = revision
            output_files.extend(
                [ch_id, ch_change_id,
                 rev_id, rev_change_id,
                 quote_plus(rev_file[0]), rev_patch_set_num]
                for rev_file in t_file_dic[rev_key])
            sys.stdout.write("\rChange: %d / %d, Revision: %d / %d" %
                             (i, changes_len, j, revisions_len))

    # Output
    # newline="" lets the csv writer control line endings itself, so the
    # requested '\n' terminator is not translated by the text layer.
    with open(current_db + ".csv", 'w', newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, lineterminator='\n')
        sys.stdout.write("\rOutputting files...")
        writer.writerow(["ch_id", "ch_change_id",
                         "rev_id", "rev_change_id",
                         "f_file_name", "rev_patchSetNum"])
        writer.writerows(output_files)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""
Download the diff of each revised file listed in the csv
"""
from csv import DictReader
from sys import argv, stdout
from os import mkdir, path, error
from time import sleep
from requests import get, exceptions
# Help text printed when the arguments are invalid or -h/--help is given.
# Built from adjacent literals: a backslash continuation inside the string
# would splice the next source line (including any indentation) into the text.
USAGE = ("Usage: python3 src/RequestFileDiff.py current_db requests_header start end"
         " [--from-ini] [--from-prev]")

# Which patch set a diff is requested against (consumed by make_param_from).
FROM_BASE = 0  # default: no "base" request parameter is sent
FROM_INI = 1   # --from-ini: "base" is patch set 1
FROM_PREV = 2  # --from-prev: "base" is the preceding patch set
def main():
    """
    Download the diff of every file listed in rows ``start``..``end`` of the csv.

    Command line:
    ``RequestFileDiff.py current_db requests_header start end [--from-ini] [--from-prev]``

    - ``current_db``: ``<current_db>.csv`` is read from the working directory
      and results go to ``./revision_files/<current_db>/``.
    - ``requests_header``: URL prefix, e.g. https://review.openstack.org
    - ``start`` / ``end``: 1-based, inclusive range of csv data rows.
    - ``--from-ini`` / ``--from-prev``: select the diff base (see
      make_param_from); without either flag no base is sent.

    For each row, ``<header>/changes/<ch_id>/revisions/<patch set>/files/<file>/diff``
    is requested up to four times and the response text is stored in
    ``./revision_files/<current_db>/<rev_id>/<file>.json``.
    """
    # Local imports: these names are not part of the file-level import list.
    from itertools import islice
    from os import makedirs

    base_mode = FROM_BASE
    if "--from-ini" in argv:
        base_mode = FROM_INI
        argv.remove("--from-ini")
    elif "--from-prev" in argv:
        base_mode = FROM_PREV
        argv.remove("--from-prev")

    if len(argv) != 5 or "-h" in argv or "--help" in argv:
        print(USAGE)
        return

    # Set argument
    current_db = argv[1]
    requests_header = argv[2]  # exp) https://review.openstack.org
    start = int(argv[3])
    end = int(argv[4])

    # Make project's directory (parents included, no error if it exists)
    projects_path = "./revision_files/" + current_db
    makedirs(projects_path, exist_ok=True)

    with open(current_db + ".csv", 'r', newline="") as csvfile:
        reader = DictReader(csvfile)

        # Select rows start..end without consuming row `start` while skipping;
        # a read-then-break skip loop would silently drop that row.
        first = max(start, 1)
        selected_rows = islice(reader, first - 1, max(end, first - 1))

        for i, row in enumerate(selected_rows, start=first):
            f_file_name = str(row["f_file_name"])
            rev_patch_set_num = str(row["rev_patchSetNum"])
            requests_url = "/".join([requests_header,
                                     "changes", str(row["ch_id"]),
                                     "revisions", rev_patch_set_num,
                                     "files", f_file_name,
                                     "diff"])
            params = make_param_from(int(rev_patch_set_num), base_mode)

            # Reset per row so a failed download can never reuse the response
            # that belongs to the previous row.
            response = None
            for _ in range(1, 5):
                try:
                    response = get(requests_url, params=params)
                except exceptions.RequestException as err:
                    print("\n" + str(i) + ": " + str(err))
                    sleep(30)
                    continue
                if response.status_code == 200:
                    break
                print("\n" + str(i) + ": " + requests_url + " " + str(response.status_code))
                if response.status_code == 404:
                    # Retrying a missing resource is pointless
                    break
                sleep(30)

            if response is None:
                # Every attempt raised: there is nothing to store for this row
                continue

            # NOTE(review): a non-200 body (e.g. the 404 page) is still written
            # below, as before — confirm whether such rows should be skipped.
            response.encoding = 'utf-8'

            # Output
            revisions_path = "/".join([projects_path, row["rev_id"]])
            makedirs(revisions_path, exist_ok=True)
            output_path = "/".join([revisions_path, f_file_name + ".json"])
            try:
                # The text was decoded as UTF-8, so write it back as UTF-8
                # instead of relying on the locale's default encoding.
                with open(output_path, 'w', encoding='utf-8') as out_file:
                    out_file.write(response.text)
            except OSError:
                print("\nOS Error")
                continue
            stdout.write("\rFile: %d / %d" % (i, end))
def make_param_from(rev_patch_set_num, base_mode):
    """
    Build the query parameters for a diff request.

    Returns None when no "base" should be sent: for patch set 1, for
    FROM_BASE, and for any unrecognised mode. Otherwise returns a dict
    whose "base" entry is "1" (FROM_INI) or the preceding patch set
    number as a string (FROM_PREV).
    """
    # Patch set 1 never gets a base, whatever the mode is
    if rev_patch_set_num == 1:
        return None

    if base_mode == FROM_INI:
        base = "1"
    elif base_mode == FROM_PREV:
        base = str(rev_patch_set_num - 1)
    else:
        # FROM_BASE or an unknown mode
        return None
    return {"base": base}
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment