Skip to content

Instantly share code, notes, and snippets.

@pandanote-info
Last active December 9, 2020 23:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pandanote-info/010a1b16450cf18f2d0d89e92d63271c to your computer and use it in GitHub Desktop.
Save pandanote-info/010a1b16450cf18f2d0d89e92d63271c to your computer and use it in GitHub Desktop.
matplotlibとProcessPoolExecutorを使用して動画用の画像を並列処理で作成するためのPython3のプログラム。
#!/usr/bin/python3
#
# See https://pandanote.info/?p=6970 for details.
#
import datetime
import math
import json
import numpy as np
from scipy.sparse import lil_matrix
import io, sys
import argparse
import time
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import FormatStrFormatter
from mpl_toolkits.mplot3d import Axes3D
import os
import concurrent.futures as cf
def generate3dimage(ab, dim, verbose, outfile, colbar_label, labels, fontsize = 1.5, scale = 1.0, view = None):
    """Render a sparse matrix as a 3D bar chart and save it to *outfile*.

    Parameters:
        ab: sparse matrix (anything with ``toarray()``); one bar per element,
            with x = column index and y = row index.
        dim: shape tuple supplied by the caller.  Kept for interface
            compatibility; the bar grid is derived from ``ab`` itself so
            that non-square matrices are laid out correctly.
        verbose: when equal to 1, the labels and array shapes are printed.
        outfile: path of the image file handed to ``plt.savefig``.
        colbar_label: label of the colour bar.
        labels: keyword arguments forwarded to ``ax.set`` (e.g. xlabel).
        fontsize: font scale passed to ``sns.set_context``.
        scale: multiplier applied to the bar heights (used to animate
            growing bars); the z axis limit always uses the unscaled data.
        view: optional ``(elevation, azimuth)`` pair for the camera.
    """
    sns.set_context("paper", font_scale=fontsize)
    plt.figure(figsize=(16, 12))
    try:
        ax = plt.axes(projection='3d')
        print(outfile)
        if verbose == 1:
            print(labels)
        ax.set(**labels)
        if view:
            ax.view_init(elev=view[0], azim=view[1])

        # Densify once; the flattened values are reused for heights, colours
        # and the colour bar.
        heights = ab.toarray()
        nrows, ncols = heights.shape
        dz = heights.ravel()

        # meshgrid + ravel enumerates the cells in the same row-major order
        # as dz.  The 0.5 offset centres each unit-wide bar on its index.
        _X, _Y = np.meshgrid(np.arange(ncols), np.arange(nrows))
        x = _X.ravel() - 0.5
        y = _Y.ravel() - 0.5
        Z = np.zeros_like(dz)
        dx = 1
        dy = 1
        if verbose == 1:
            print(x.shape)
            print(y.shape)
            print(Z.shape)
            print(dz.shape)

        # Colours and z limit are based on the unscaled data so that they
        # stay fixed while the bars grow.
        norm = plt.Normalize(0, dz.max())
        colors = plt.cm.terrain_r(norm(dz))
        ax.set_zlim(0, dz.max())
        bar_heights = (dz * scale) if abs(scale) > 1e-10 else np.zeros_like(dz)
        ax.bar3d(x, y, Z, dx, dy, bar_heights, color=colors)

        # Share the normalisation with the bars so the colour bar matches
        # them, and name the Axes explicitly: the mappable is not attached
        # to any Axes, so colorbar cannot infer one.
        colourMap = plt.cm.ScalarMappable(norm=norm, cmap=plt.cm.terrain_r)
        colourMap.set_array(dz)
        plt.colorbar(colourMap, ax=ax).set_label(colbar_label)
        plt.savefig(outfile)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close('all')
def calculate_cosine_similarity(aa,articleids,resultout,statsout):
    """Compute pairwise cosine similarity between the rows of *aa*.

    Parameters:
        aa: sparse matrix (anything with ``toarray()``), one row per article.
        articleids: sequence of article IDs; ``articleids[i]`` labels row i.
        resultout: path of the JSON file receiving the similarity matrix as
            a nested list.
        statsout: path of the JSON file receiving ``[[id_i, id_j], value]``
            entries sorted by descending similarity.

    Returns:
        A ``lil_matrix`` of shape (rows, rows) whose strict upper triangle
        holds the similarities; the diagonal and lower triangle stay zero.

    A row whose norm is zero has no direction, so its similarity to every
    other row is reported as 0.0 instead of dividing by zero.
    """
    # Densify once instead of converting rows inside the O(n^2) pair loop.
    rows = aa.toarray()
    count = rows.shape[0]
    norms = [math.sqrt(np.dot(row, row)) for row in rows]

    ab = lil_matrix((count, count))
    ast = []
    for i in range(count):
        for j in range(i + 1, count):
            if norms[i] == 0.0 or norms[j] == 0.0:
                similarity = 0.0
            else:
                similarity = float(np.dot(rows[i], rows[j]) / norms[i] / norms[j])
            ab[i, j] = similarity
            ast.append(((articleids[i], articleids[j]), similarity))

    # The with-statement closes each file; no explicit close() is needed.
    with open(resultout, mode='w') as f:
        f.write(json.dumps(ab.toarray().tolist()))
    with open(statsout, mode='w') as f:
        f.write(json.dumps(sorted(ast, key=lambda x: x[1], reverse=True)))
    return ab
def _render_frames(jobs, parallel):
    """Call generate3dimage(*job) for every job, optionally in a process pool."""
    if not jobs:
        return
    if parallel:
        with cf.ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
            # Drain the result iterator: Executor.map only re-raises worker
            # exceptions when its results are consumed.
            list(executor.map(generate3dimage, *zip(*jobs)))
    else:
        for job in jobs:
            generate3dimage(*job)


if __name__ == '__main__':
    # Script entry point: read a word-frequency JSON file, build a TF-IDF
    # matrix (optionally reduced to article-to-article cosine similarity)
    # and plot it as a heatmap, a 3D bar chart, or a series of 3D frames
    # for a movie.
    start = time.time()
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    parser = argparse.ArgumentParser(description='Command line options of freqreadtest.py')
    parser.add_argument('-i','--input-file', type=str, default='../../scala/pandacorpus/pandanote_frequencies_result_20201107211207.txt', help='Input file in JSON format')
    # const=1 so that a bare "-v" actually turns verbose output on.
    parser.add_argument('-v','--verbose', help='Increase output verbosity.', nargs='?', default=0, const=1, type=int)
    parser.add_argument('-t','--three-dimension', help='Output as 3D chart.', action="store_true")
    parser.add_argument('-m','--movie', help='Create files for 3D chart movie.', action="store_true")
    parser.add_argument('-p','--parallel', help='Create files for 3D chart in parallel.', action="store_true")
    parser.add_argument('-g','--grow-bars', help='Grows bar before rotate.', nargs='?', default=0.1, const=0.1, type=float)
    parser.add_argument('-d','--output-dir', help='Output directory for 3D chart.', type=str)
    parser.add_argument('-c','--cosine-similarity', help='Calculate cosine similarity between two articles.', action="store_true")
    parser.add_argument('-f','--font-scale', help='Size of font as a scale value.', nargs='?', type=float)
    args = parser.parse_args()
    print(args)
    inputfile = args.input_file
    verbose = args.verbose
    three_dimension = args.three_dimension
    movie = args.movie
    parallel = args.parallel
    grow_bars = args.grow_bars
    output_dir = args.output_dir
    cosine_similarity = args.cosine_similarity
    fontsize = args.font_scale if args.font_scale else 1.5
    # Prefix for every output path; join() adds the separator only if needed.
    out_prefix = os.path.join(output_dir, "") if output_dir else ""

    # The input maps each word to a list of "articleid,frequency" strings.
    with open(inputfile, encoding='utf-8') as fh:
        freqlist = json.load(fh)
    words = list(freqlist)
    found_ids = set()
    for v in freqlist.values():
        for vv in v:
            a, f = vv.split(",")
            found_ids.add(int(a))
    articleids = sorted(found_ids)
    alen = len(articleids)
    wlen = len(words)
    if alen == 0 or wlen == 0:
        sys.exit("No word frequencies found in {0:s}".format(inputfile))

    # Bag-of-words matrix: rows are articles, columns are words.  A dict
    # lookup replaces the repeated list.index() scans.
    article_row = {a: n for n, a in enumerate(articleids)}
    bow = lil_matrix((alen, wlen))
    for wi, v in enumerate(freqlist.values()):
        for vv in v:
            a, f = map(int, vv.split(","))
            bow[article_row[a], wi] = f
    nzelemnum = len(bow.nonzero()[0])
    print("Matrix size: {0:d}x{1:d}, Number of element which has non-zero value: {2:d}".format(alen,wlen,nzelemnum))
    print("Rate of element which has non-zero value: {0:f}%".format(100.0*nzelemnum/alen/wlen))
    dim = bow.shape

    # TF: divide every row by its total count.
    # NOTE(review): matrices are printed only when verbose == 1, while the
    # print options below are changed only when verbose > 1 -- confirm
    # whether the prints were meant to use ">= 1".
    if verbose == 1:
        print("-- TF")
    aa = bow.copy()
    if verbose > 1:
        np.set_printoptions(threshold=np.inf,formatter={'float': '{:.8f}'.format})
    for i in range(dim[0]):
        ar = bow.getrow(i)
        aa[i] = ar / ar.sum()
    if verbose == 1:
        print(aa.todense())

    # IDF(ln): scale every column by ln(articles / articles containing the word).
    if verbose == 1:
        print("-- TF-IDF(in)")
    for j in range(dim[1]):
        ac = aa.getcol(j)
        idf = math.log(dim[0]/ac.getnnz())
        aa[0:dim[0],j] = ac*idf
    if verbose == 1:
        print(aa.toarray())

    # One timestamp is shared by every file written during this run.
    datetimestr = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    chartprefix = 'freqreadtest'
    if cosine_similarity:
        chartprefix = 'freqread_cosine_similarity'
        ab = calculate_cosine_similarity(
            aa,
            articleids,
            out_prefix+chartprefix+'_'+datetimestr+'.json',
            out_prefix+chartprefix+'_stats_'+datetimestr+'.json',
        )
        labels = {'xlabel':'Articles at https://pandanote.info/ (ID): column','ylabel':'Articles at https://pandanote.info/ (ID): row','zlabel':'Similarity'}
        colbar_label = 'Similarity'
    else:
        # Only the leading dim[0] x dim[0] corner of the TF-IDF matrix is plotted.
        ab = aa[0:dim[0],0:dim[0]]
        labels = {'xlabel':'word(ID)','ylabel':'Articles at https://pandanote.info/ (ID)','zlabel':'Importance'}
        colbar_label = 'Importance'

    if not three_dimension:
        filename = out_prefix+chartprefix+'_'+datetimestr+'.png'
        sns.set_context("paper", font_scale=fontsize)
        plt.figure(figsize=(16, 12))
        ax = sns.heatmap(ab.toarray(),cmap="terrain_r" if cosine_similarity else "bone_r") # Blues,terrain,CMRmap_r
        if verbose == 1:
            print(sns.color_palette())
        if cosine_similarity:
            ax.set(xlabel='Articles at https://pandanote.info/ (ID): column',ylabel='Articles at https://pandanote.info/ (ID): row')
        else:
            ax.set(xlabel='word(ID)',ylabel='Articles at https://pandanote.info/ (ID)')
        ax.xaxis.set_ticks_position('top')
        ax.xaxis.set_label_position('top')
        ax.xaxis.set_major_locator(ticker.MultipleLocator(20))
        ax.xaxis.set_major_formatter(FormatStrFormatter("%d"))
        ax.yaxis.set_major_locator(ticker.MultipleLocator(20))
        ax.yaxis.set_major_formatter(FormatStrFormatter("%d"))
        plt.savefig(filename)
        print("Output file: {0:s}".format(filename))
    else:
        sns.set_context("paper", font_scale=fontsize)
        if movie:
            if grow_bars:
                print("Grow rate: {0:f}".format(grow_bars))
                filenameprefix = out_prefix+chartprefix+'_3d_grow_bars_'+datetimestr
                print("Filename prefix for growing bars: {0:s}".format(filenameprefix))
                # Frames are numbered by position, never by float division,
                # so the parallel and sequential paths write identical names.
                jobs = [
                    (ab,dim,verbose,filenameprefix+"_{0:04d}.png".format(count),colbar_label,labels,fontsize,bar_scale,(30,0))
                    for count, bar_scale in enumerate(np.linspace(0,1.0,num=int(1.0/grow_bars)))
                ]
                _render_frames(jobs, parallel)
            filenameprefix = out_prefix+chartprefix+'_3d_'+datetimestr
            print("Filename prefix for rotating plots: {0:s}".format(filenameprefix))
            angle_of_rotation = 360
            interval = 1
            # One frame per azimuth step, covering a full turn (0..360 inclusive).
            jobs = [
                (ab,dim,verbose,filenameprefix+"_{0:04d}.png".format(count),colbar_label,labels,fontsize,1.0,(30,azimuth))
                for count, azimuth in enumerate(range(0,angle_of_rotation+1,interval))
            ]
            _render_frames(jobs, parallel)
        else:
            filename = out_prefix+chartprefix+'_3d_'+datetimestr+'.png'
            generate3dimage(ab,dim,verbose,filename,colbar_label,labels,fontsize)
            print("Output file: {0:s}".format(filename))
    elapsed_time = time.time() - start
    print("Elapsed time:{0:f} sec.".format(elapsed_time))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment