gyosit/gene.py

## gene.py
class Gene:
  """One candidate solution (a real-valued vector) for a simple genetic algorithm.

  Every locus ``i`` is drawn uniformly from ``[mins[i], maxs[i]]``. ``fitness``
  starts at 0 and is filled in by ``getFitness`` or ``getFitness_``.
  """

  def __init__(self, num, mins, maxs):
    """Create a gene with ``num`` loci sampled uniformly within the bounds.

    Args:
      num: Number of loci.
      mins: Per-locus lower bounds (``num`` entries).
      maxs: Per-locus upper bounds (``num`` entries).

    Raises:
      ValueError: If ``mins`` or ``maxs`` does not have exactly ``num`` entries.
    """
    if len(mins) != num or len(maxs) != num:
      raise ValueError("mins and maxs must both have exactly num entries")
    self.gene = [random.uniform(lo, hi) for lo, hi in zip(mins, maxs)]
    self.num = num
    self.mins = mins
    self.maxs = maxs
    self.fitness = 0

  @classmethod
  def getFitness(cls, genes, model):
    """Score every gene with ``model`` and store the result in ``fitness``.

    The gene vectors are stacked into a single ``(len(genes), num)`` array so
    ``model.predict`` is called once for the whole population.

    Args:
      genes: List of ``Gene`` objects (all with the same ``num``).
      model: Object whose ``predict(x)`` yields one score per row of ``x``.
    """
    if not genes:
      return  # nothing to score; avoids indexing genes[0] on an empty list
    x = np.array([g.gene for g in genes]).reshape(-1, genes[0].num)
    for g, score in zip(genes, model.predict(x)):
      g.fitness = score

  def getFitness_(self):
    """Model-free fitness: first locus minus second locus. Stores and returns it."""
    self.fitness = self.gene[0] - self.gene[1]
    return self.fitness

  def mutation(self, p=0.2):
    """With probability ``p``, resample one randomly chosen locus within its bounds.

    Args:
      p: Mutation probability in ``[0, 1]``.
    """
    # A continuous draw gives exactly probability p. The previous integer draw
    # (randint(0, 10) <= p*10) fired with probability (floor(10p)+1)/11, so
    # even p=0 mutated about 9% of the time.
    if random.random() < p:
      i = random.randrange(self.num)
      self.gene[i] = random.uniform(self.mins[i], self.maxs[i])

  @classmethod
  def sortGene(cls, genes, model):
    """Evaluate ``genes`` with ``model`` and return them sorted by descending fitness."""
    cls.getFitness(genes, model)
    return sorted(genes, key=lambda g: g.fitness, reverse=True)

  @classmethod
  def select(cls, genes):
    """Return the first quarter (rounded down) of ``genes``.

    Intended to be called on the output of ``sortGene`` so the fittest survive.
    """
    return genes[:len(genes) // 4]

  def cross(self, g1, g2):
    """Single-point crossover of ``g1`` and ``g2``.

    ``self`` is not used; ``crossover`` calls this through the class.

    Returns:
      ``[g1, g2, child1, child2]``. The parents are the same objects that were
      passed in (not copies); the children swap tails at a random cut point.
    """
    # max(1, ...) keeps single-locus genes from raising in randint(1, 0);
    # with one locus the children are simply copies of the parents.
    cut = random.randint(1, max(1, g1.num - 1))
    child1 = Gene(g1.num, g1.mins, g1.maxs)
    child2 = Gene(g1.num, g1.mins, g1.maxs)
    child1.gene = g1.gene[:cut] + g2.gene[cut:]
    child2.gene = g2.gene[:cut] + g1.gene[cut:]
    return [g1, g2, child1, child2]

  @classmethod
  def crossover(cls, genes):
    """Breed a new population four times the size of ``genes``.

    Parents are picked uniformly at random (with replacement), and each pairing
    contributes both parents plus two children.
    NOTE(review): the same parent object can therefore appear more than once in
    the returned list.
    """
    new_genes = []
    target = len(genes) * 4
    while len(new_genes) < target:
      g1, g2 = random.choice(genes), random.choice(genes)
      new_genes += cls.cross(cls, g1, g2)
    return new_genes

## graphic.py
class Graphics():
  """Matplotlib helper for side-by-side daily time-series plots."""

  def dayPlot2D(self, startday, y, title, xlabel, ylabel):
    """Draw a two-column grid of daily line plots and return the figure.

    Row ``i`` shows ``y[0][i]`` on the left and ``y[1][i]`` on the right, both
    against a daily date axis starting at ``startday``.

    Args:
      startday: First date of the x axis (passed to ``pd.date_range``).
      y: Pair of 2-D arrays indexed ``[series][day]``; ``y[1]`` fixes the
        number of subplot rows and the number of days.
      title: Pair of column titles, shown above the first row only.
      xlabel: Pair of x-axis labels, one per column.
      ylabel: Pair of per-series y-axis label lists, one list per column.

    Returns:
      The matplotlib figure (``fig.show()`` has already been called on it).
    """
    n_rows = y[1].shape[0]
    n_cols = 2

    days = pd.date_range(startday, periods=y[1].shape[1], freq='d')
    plt.rcParams["font.size"] = 12

    fig = plt.figure(figsize=(15,20))

    for series_idx in range(y[0].shape[0]):
      # side 0 is the left column, side 1 the right; the subplot slots for a
      # row are 1 + 2*row and 2 + 2*row.
      for side in (0, 1):
        ax = fig.add_subplot(n_rows, n_cols, 1 + side + series_idx*2)
        ax.plot(days, y[side][series_idx], linewidth=2)
        if series_idx == 0:
          ax.set_title(title[side], fontsize=18)
        ax.set_xlabel(xlabel[side], fontsize=18)
        ax.set_ylabel(ylabel[side][series_idx], fontsize=18)
        ax.grid(True)

    # Tilt the date tick labels on every subplot.
    for ax in fig.axes:
      plt.sca(ax)
      plt.xticks(rotation=30)

    fig.show()

    return fig

## main.py
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.animation as animation
import random
from mpl_toolkits.mplot3d import Axes3D
from scipy.interpolate import griddata
import pandas as pd
from keras import regularizers
import datetime
import random as rnd
from sklearn.metrics import r2_score
import math
from tqdm import tqdm
import copy
from sklearn.metrics import mean_squared_error

#Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
import pydotplus as pdp
import pickle
from PIL import Image
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

# Script entry point: load the CSV, split it into folds, and train one random
# forest per target column on each fold's training part.
if __name__ == '__main__':
  rf = RF()
  df = pd.read_csv('kyuri_liquid_202001-06.csv') # file name
  # df_ls / df_vs: per-fold training and validation DataFrames (see RF.split).
  df_ls, df_vs = rf.split(df=df, learning=0.8, fold=5)
  #print(np.sort(df_ls[0].values[:,0]))
  #print(np.sort(df_v.values[:,0]))

  x_h_c = ["radiation"
           , "temp"
           , "day_temp"
           , "night_temp"
           , "day_satiety"
           , "night_satiety"] # inputs (climate)
  # The bare string below is a no-op expression holding alternative, disabled
  # input-column selections.
  """
  # For net_photo
  x_h_c = ["radiation"
          , "night_temp"
          , "temp"
          , "day_temp"]

  # For net_photo
  x_h_c = ["radiation"
          , "temp"
          , "day_temp"]
  """


  x_h = x_h_c
  y_h = ["net_photo"] # outputs

  # One pass per fold.
  for j in range(len(df_ls)):
    df_l = df_ls[j]
    df_v = df_vs[j]  # only referenced by the commented-out validation calls
    print(df_l.values[:,0])
    x = rf.prepare(df_l, x_h)

    # Training
    forests = []
    for targ in y_h:
      # NOTE(review): the file suffix passed here is the constant 0, not j, so
      # machineL writes the same '<targ>0.sav' file on every fold — confirm
      # whether j was intended.
      forest, indices, importances = rf.machineL(x, df_l[targ], targ, 0) # x, y
      forests.append(forest)
      #rf.showRF(forests[0])
      # Validation
      #rf.validation(forests, df_v, x_h_c, y_h, j)
      #rf.gradation(forests[0], [20.0, 25.0, -1, -2, 6.0, 2.0], np.arange(11)*4, np.arange(11)*4)
      print("-----")

    # ---High contributing
    # Disabled branch: retrains with a growing feature list, adding one column
    # per iteration in the order given by `indices` from the last fit above.
    high_cont = False
    if(high_cont):
      hc_x_h = []
      for i, hc in enumerate(indices):
        hc_x_h.append(x_h_c[hc])
        x = rf.prepare(df_l, hc_x_h)
        forests = []
        for targ in y_h:
          # NOTE(review): `indices` is rebound here while the enclosing loop is
          # iterating over the previous `indices` object.
          forest, indices, importances = rf.machineL(x, df_l[targ], targ, j) # x, y
          # Reloads the model machineL just pickled; the file handle opened
          # here is never explicitly closed.
          forest = pickle.load(open(targ+str(j)+'.sav', 'rb'))
          forests.append(forest)
          print("-")
        #print(rf.validation(forests, df_v, hc_x_h, y_h, j))
        print("-----")

    # The bare string below is a no-op expression holding a disabled
    # genetic-algorithm search that uses forests[0] as the fitness model.
    """
    genes = [Gene(6, [5,20,28,14,5,1],[20,30,33,26,10,4]) for i in range(10000)]
    for i in range(100):
      [g.mutation() for g in genes]
      genes = Gene.sortGene(genes, forests[0])
      print(i, genes[0].gene, genes[0].fitness)
      genes = Gene.select(genes)
      genes = Gene.crossover(genes)
    """

## rf.py
class RF:
  """Random-forest regression workflow: fold splitting, training, validation
  and inspection helpers."""

  def split(self, df, learning=0.8, fold=5):
    """Split ``df`` row-wise into ``fold`` consecutive, unshuffled folds.

    Args:
      df: DataFrame to split.
      learning: Unused; kept so existing callers keep working.
      fold: Number of folds.

    Returns:
      ``(df_l, df_v)``: two lists of length ``fold`` holding the training and
      validation frame of each fold.
    """
    # random_state is intentionally not passed: it has no effect without
    # shuffling, and current scikit-learn raises ValueError for that combination.
    kf = KFold(n_splits=fold, shuffle=False)
    df_l, df_v = [], []
    for train_index, test_index in kf.split(df):
      df_l.append(df.iloc[train_index])
      df_v.append(df.iloc[test_index])
    return df_l, df_v

  def prepare(self, df, header):
    """Return the columns named in ``header`` as a ``(rows, len(header))`` array."""
    return np.stack([df[h] for h in header]).T

  def machineL(self, x, y, target, j):
    """Fit a 1000-tree random forest, pickle it and print the feature ranking.

    Args:
      x: Feature matrix, one column per feature.
      y: Target values.
      target: Target name; used as the stem of the pickle file name.
      j: Suffix appended to the pickle file name.

    Returns:
      ``(forest, indices, importances)`` where ``indices`` orders the features
      by decreasing importance.
    """
    forest = RandomForestRegressor(n_estimators=1000, n_jobs = -1)
    forest.fit(x, y)
    # The context manager closes the file even if pickling fails.
    with open('./'+target+str(j)+'.sav','wb') as fh:
      pickle.dump(forest, fh)

    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    for rank in range(x.shape[1]):
      print("%d. feature %d (%f)" % (rank+1, indices[rank], importances[indices[rank]]))

    return forest, indices, importances

  def validation(self, forests, df, x_h_c, y_h, j):
    """Predict on ``df``, save an actual-vs-predicted CSV and plot, return the MSE.

    Writes ``result<k>_<j>.csv`` and ``Predicted<k>_<j>.png`` to the working
    directory, where ``k`` is the number of input columns.

    Args:
      forests: Fitted regressors; presumably one per name in ``y_h``, in the
        same order, each predicting a single value per row (verify against caller).
      df: DataFrame holding both the input and the target columns.
      x_h_c: Input column names.
      y_h: Target column names.
      j: Index used in the output file names.

    Returns:
      Mean squared error between the first target column and its prediction.
    """
    tag = str(len(x_h_c))

    # self.prepare rather than a module-level `rf`, so the method also works
    # when the class is used outside the script.
    x_c = self.prepare(df, x_h_c) # climate data
    y = self.prepare(df, y_h) # actual data

    # One batched predict per forest instead of one call per row.
    p = np.column_stack([np.ravel(forest.predict(x_c)) for forest in forests])
    # Snap near-zero predictions to exactly zero.
    p[np.abs(p) < 0.0001] = 0

    joint = np.concatenate([y, p], axis=1)
    columns = [h+"(actual)" for h in y_h] + [h+"(predicted)" for h in y_h]
    pd.DataFrame(joint, columns=columns).to_csv("result"+tag+"_"+str(j)+".csv")

    g = Graphics()
    fig = g.dayPlot2D("20191231", y=[y.T, p.T], title=["Actual", "Predicted"], xlabel=["day","day"], ylabel=[y_h, y_h])
    fig.savefig("Predicted"+tag+"_"+str(j)+".png")

    return mean_squared_error(y.T[0], p.T[0])

  def gradation(self, forest, fix, range1, range2):
    """Tabulate predictions over a 2-D grid of two inputs and save ``gradation.csv``.

    Entries of ``fix`` equal to -1 are replaced by a value from ``range1`` and
    entries equal to -2 by a value from ``range2``; all other entries stay
    fixed. Note that any genuine input value of -1 or -2 is replaced as well.

    The saved table has a leading 0, then ``range2`` across the first row,
    ``range1`` down the first column, and predictions in the remaining cells.

    Args:
      forest: Fitted regressor with ``predict``.
      fix: Template input vector containing the -1 / -2 placeholders.
      range1: Values substituted for -1 (table rows).
      range2: Values substituted for -2 (table columns).
    """
    rows = [[0, *range2]]
    for i, x1 in enumerate(range1):
      row = [x1]
      for j, x2 in enumerate(range2):
        x = np.array(fix)
        x = np.where(x==-1, x1, x)
        x = np.where(x==-2, x2, x)
        forsee = forest.predict(x.reshape(1,-1))
        print(i, j, x, forsee)
        row.extend(np.ravel(forsee))
      rows.append(row)
    hist_p = np.array(rows, dtype=float)

    df = pd.DataFrame(hist_p)
    df.to_csv("gradation.csv")

  def showRF(self, rf):
    """Render the first tree of the fitted forest ``rf`` to ``./tree.png``."""
    estimator = rf.estimators_[0]
    filename = "./tree.png"
    dot_data = tree.export_graphviz(
                estimator,
                out_file=None,
                filled=True,
                rounded=True,
                special_characters=True
                )
    graph = pdp.graph_from_dot_data(dot_data)
    graph.write_png(filename)
	class Gene:
	  """One candidate solution (a real-valued vector) for a simple genetic algorithm.

	  Every locus ``i`` is drawn uniformly from ``[mins[i], maxs[i]]``. ``fitness``
	  starts at 0 and is filled in by ``getFitness`` or ``getFitness_``.
	  """

	  def __init__(self, num, mins, maxs):
	    """Create a gene with ``num`` loci sampled uniformly within the bounds.

	    Args:
	      num: Number of loci.
	      mins: Per-locus lower bounds (``num`` entries).
	      maxs: Per-locus upper bounds (``num`` entries).

	    Raises:
	      ValueError: If ``mins`` or ``maxs`` does not have exactly ``num`` entries.
	    """
	    if len(mins) != num or len(maxs) != num:
	      raise ValueError("mins and maxs must both have exactly num entries")
	    self.gene = [random.uniform(lo, hi) for lo, hi in zip(mins, maxs)]
	    self.num = num
	    self.mins = mins
	    self.maxs = maxs
	    self.fitness = 0

	  @classmethod
	  def getFitness(cls, genes, model):
	    """Score every gene with ``model`` and store the result in ``fitness``.

	    The gene vectors are stacked into a single ``(len(genes), num)`` array so
	    ``model.predict`` is called once for the whole population.

	    Args:
	      genes: List of ``Gene`` objects (all with the same ``num``).
	      model: Object whose ``predict(x)`` yields one score per row of ``x``.
	    """
	    if not genes:
	      return  # nothing to score; avoids indexing genes[0] on an empty list
	    x = np.array([g.gene for g in genes]).reshape(-1, genes[0].num)
	    for g, score in zip(genes, model.predict(x)):
	      g.fitness = score

	  def getFitness_(self):
	    """Model-free fitness: first locus minus second locus. Stores and returns it."""
	    self.fitness = self.gene[0] - self.gene[1]
	    return self.fitness

	  def mutation(self, p=0.2):
	    """With probability ``p``, resample one randomly chosen locus within its bounds.

	    Args:
	      p: Mutation probability in ``[0, 1]``.
	    """
	    # A continuous draw gives exactly probability p. The previous integer draw
	    # (randint(0, 10) <= p*10) fired with probability (floor(10p)+1)/11, so
	    # even p=0 mutated about 9% of the time.
	    if random.random() < p:
	      i = random.randrange(self.num)
	      self.gene[i] = random.uniform(self.mins[i], self.maxs[i])

	  @classmethod
	  def sortGene(cls, genes, model):
	    """Evaluate ``genes`` with ``model`` and return them sorted by descending fitness."""
	    cls.getFitness(genes, model)
	    return sorted(genes, key=lambda g: g.fitness, reverse=True)

	  @classmethod
	  def select(cls, genes):
	    """Return the first quarter (rounded down) of ``genes``.

	    Intended to be called on the output of ``sortGene`` so the fittest survive.
	    """
	    return genes[:len(genes) // 4]

	  def cross(self, g1, g2):
	    """Single-point crossover of ``g1`` and ``g2``.

	    ``self`` is not used; ``crossover`` calls this through the class.

	    Returns:
	      ``[g1, g2, child1, child2]``. The parents are the same objects that were
	      passed in (not copies); the children swap tails at a random cut point.
	    """
	    # max(1, ...) keeps single-locus genes from raising in randint(1, 0);
	    # with one locus the children are simply copies of the parents.
	    cut = random.randint(1, max(1, g1.num - 1))
	    child1 = Gene(g1.num, g1.mins, g1.maxs)
	    child2 = Gene(g1.num, g1.mins, g1.maxs)
	    child1.gene = g1.gene[:cut] + g2.gene[cut:]
	    child2.gene = g2.gene[:cut] + g1.gene[cut:]
	    return [g1, g2, child1, child2]

	  @classmethod
	  def crossover(cls, genes):
	    """Breed a new population four times the size of ``genes``.

	    Parents are picked uniformly at random (with replacement), and each pairing
	    contributes both parents plus two children.
	    NOTE(review): the same parent object can therefore appear more than once in
	    the returned list.
	    """
	    new_genes = []
	    target = len(genes) * 4
	    while len(new_genes) < target:
	      g1, g2 = random.choice(genes), random.choice(genes)
	      new_genes += cls.cross(cls, g1, g2)
	    return new_genes
	class Graphics():
	  """Matplotlib helper for side-by-side daily time-series plots."""

	  def dayPlot2D(self, startday, y, title, xlabel, ylabel):
	    """Draw a two-column grid of daily line plots and return the figure.

	    Row ``i`` shows ``y[0][i]`` on the left and ``y[1][i]`` on the right, both
	    against a daily date axis starting at ``startday``.

	    Args:
	      startday: First date of the x axis (passed to ``pd.date_range``).
	      y: Pair of 2-D arrays indexed ``[series][day]``; ``y[1]`` fixes the
	        number of subplot rows and the number of days.
	      title: Pair of column titles, shown above the first row only.
	      xlabel: Pair of x-axis labels, one per column.
	      ylabel: Pair of per-series y-axis label lists, one list per column.

	    Returns:
	      The matplotlib figure (``fig.show()`` has already been called on it).
	    """
	    n_rows = y[1].shape[0]
	    n_cols = 2

	    days = pd.date_range(startday, periods=y[1].shape[1], freq='d')
	    plt.rcParams["font.size"] = 12

	    fig = plt.figure(figsize=(15,20))

	    for series_idx in range(y[0].shape[0]):
	      # side 0 is the left column, side 1 the right; the subplot slots for a
	      # row are 1 + 2*row and 2 + 2*row.
	      for side in (0, 1):
	        ax = fig.add_subplot(n_rows, n_cols, 1 + side + series_idx*2)
	        ax.plot(days, y[side][series_idx], linewidth=2)
	        if series_idx == 0:
	          ax.set_title(title[side], fontsize=18)
	        ax.set_xlabel(xlabel[side], fontsize=18)
	        ax.set_ylabel(ylabel[side][series_idx], fontsize=18)
	        ax.grid(True)

	    # Tilt the date tick labels on every subplot.
	    for ax in fig.axes:
	      plt.sca(ax)
	      plt.xticks(rotation=30)

	    fig.show()

	    return fig
	import numpy as np
	import matplotlib.pyplot as plt
	from mpl_toolkits.mplot3d import Axes3D
	import matplotlib.animation as animation
	import random
	from mpl_toolkits.mplot3d import Axes3D
	from scipy.interpolate import griddata
	import pandas as pd
	from keras import regularizers
	import datetime
	import random as rnd
	from sklearn.metrics import r2_score
	import math
	from tqdm import tqdm
	import copy
	from sklearn.metrics import mean_squared_error

	#Random Forest
	from sklearn.ensemble import RandomForestRegressor
	from sklearn import tree
	import pydotplus as pdp
	import pickle
	from PIL import Image
	from io import BytesIO
	from sklearn.model_selection import train_test_split
	from sklearn.model_selection import KFold

	import warnings
	warnings.filterwarnings("ignore")

	# Script entry point: load the CSV, split it into folds, and train one random
	# forest per target column on each fold's training part.
	# (Block structure restored; this copy had lost all of its indentation.)
	if __name__ == '__main__':
	  rf = RF()
	  df = pd.read_csv('kyuri_liquid_202001-06.csv') # file name
	  # df_ls / df_vs: per-fold training and validation DataFrames.
	  df_ls, df_vs = rf.split(df=df, learning=0.8, fold=5)
	  #print(np.sort(df_ls[0].values[:,0]))
	  #print(np.sort(df_v.values[:,0]))

	  x_h_c = ["radiation"
	           , "temp"
	           , "day_temp"
	           , "night_temp"
	           , "day_satiety"
	           , "night_satiety"] # inputs (climate)
	  # The bare string below is a no-op expression holding alternative, disabled
	  # input-column selections.
	  """
	  # For net_photo
	  x_h_c = ["radiation"
	          , "night_temp"
	          , "temp"
	          , "day_temp"]

	  # For net_photo
	  x_h_c = ["radiation"
	          , "temp"
	          , "day_temp"]
	  """


	  x_h = x_h_c
	  y_h = ["net_photo"] # outputs

	  # One pass per fold.
	  for j in range(len(df_ls)):
	    df_l = df_ls[j]
	    df_v = df_vs[j]  # only referenced by the commented-out validation calls
	    print(df_l.values[:,0])
	    x = rf.prepare(df_l, x_h)

	    # Training
	    forests = []
	    for targ in y_h:
	      # NOTE(review): the file suffix passed here is the constant 0, not j, so
	      # the same '<targ>0.sav' file is written on every fold — confirm
	      # whether j was intended.
	      forest, indices, importances = rf.machineL(x, df_l[targ], targ, 0) # x, y
	      forests.append(forest)
	      #rf.showRF(forests[0])
	      # Validation
	      #rf.validation(forests, df_v, x_h_c, y_h, j)
	      #rf.gradation(forests[0], [20.0, 25.0, -1, -2, 6.0, 2.0], np.arange(11)*4, np.arange(11)*4)
	      print("-----")

	    # ---High contributing
	    # Disabled branch: retrains with a growing feature list, adding one column
	    # per iteration in the order given by `indices` from the last fit above.
	    high_cont = False
	    if(high_cont):
	      hc_x_h = []
	      for i, hc in enumerate(indices):
	        hc_x_h.append(x_h_c[hc])
	        x = rf.prepare(df_l, hc_x_h)
	        forests = []
	        for targ in y_h:
	          # NOTE(review): `indices` is rebound here while the enclosing loop is
	          # iterating over the previous `indices` object.
	          forest, indices, importances = rf.machineL(x, df_l[targ], targ, j) # x, y
	          # Reloads the model that was just pickled; the file handle opened
	          # here is never explicitly closed.
	          forest = pickle.load(open(targ+str(j)+'.sav', 'rb'))
	          forests.append(forest)
	          print("-")
	        #print(rf.validation(forests, df_v, hc_x_h, y_h, j))
	        print("-----")

	    # The bare string below is a no-op expression holding a disabled
	    # genetic-algorithm search that uses forests[0] as the fitness model.
	    """
	    genes = [Gene(6, [5,20,28,14,5,1],[20,30,33,26,10,4]) for i in range(10000)]
	    for i in range(100):
	      [g.mutation() for g in genes]
	      genes = Gene.sortGene(genes, forests[0])
	      print(i, genes[0].gene, genes[0].fitness)
	      genes = Gene.select(genes)
	      genes = Gene.crossover(genes)
	    """
	class RF:
	  """Random-forest regression workflow: fold splitting, training, validation
	  and inspection helpers."""

	  def split(self, df, learning=0.8, fold=5):
	    """Split ``df`` row-wise into ``fold`` consecutive, unshuffled folds.

	    Args:
	      df: DataFrame to split.
	      learning: Unused; kept so existing callers keep working.
	      fold: Number of folds.

	    Returns:
	      ``(df_l, df_v)``: two lists of length ``fold`` holding the training and
	      validation frame of each fold.
	    """
	    # random_state is intentionally not passed: it has no effect without
	    # shuffling, and current scikit-learn raises ValueError for that combination.
	    kf = KFold(n_splits=fold, shuffle=False)
	    df_l, df_v = [], []
	    for train_index, test_index in kf.split(df):
	      df_l.append(df.iloc[train_index])
	      df_v.append(df.iloc[test_index])
	    return df_l, df_v

	  def prepare(self, df, header):
	    """Return the columns named in ``header`` as a ``(rows, len(header))`` array."""
	    return np.stack([df[h] for h in header]).T

	  def machineL(self, x, y, target, j):
	    """Fit a 1000-tree random forest, pickle it and print the feature ranking.

	    Args:
	      x: Feature matrix, one column per feature.
	      y: Target values.
	      target: Target name; used as the stem of the pickle file name.
	      j: Suffix appended to the pickle file name.

	    Returns:
	      ``(forest, indices, importances)`` where ``indices`` orders the features
	      by decreasing importance.
	    """
	    forest = RandomForestRegressor(n_estimators=1000, n_jobs = -1)
	    forest.fit(x, y)
	    # The context manager closes the file even if pickling fails.
	    with open('./'+target+str(j)+'.sav','wb') as fh:
	      pickle.dump(forest, fh)

	    importances = forest.feature_importances_
	    indices = np.argsort(importances)[::-1]
	    for rank in range(x.shape[1]):
	      print("%d. feature %d (%f)" % (rank+1, indices[rank], importances[indices[rank]]))

	    return forest, indices, importances

	  def validation(self, forests, df, x_h_c, y_h, j):
	    """Predict on ``df``, save an actual-vs-predicted CSV and plot, return the MSE.

	    Writes ``result<k>_<j>.csv`` and ``Predicted<k>_<j>.png`` to the working
	    directory, where ``k`` is the number of input columns.

	    Args:
	      forests: Fitted regressors; presumably one per name in ``y_h``, in the
	        same order, each predicting a single value per row (verify against caller).
	      df: DataFrame holding both the input and the target columns.
	      x_h_c: Input column names.
	      y_h: Target column names.
	      j: Index used in the output file names.

	    Returns:
	      Mean squared error between the first target column and its prediction.
	    """
	    tag = str(len(x_h_c))

	    # self.prepare rather than a module-level `rf`, so the method also works
	    # when the class is used outside the script.
	    x_c = self.prepare(df, x_h_c) # climate data
	    y = self.prepare(df, y_h) # actual data

	    # One batched predict per forest instead of one call per row.
	    p = np.column_stack([np.ravel(forest.predict(x_c)) for forest in forests])
	    # Snap near-zero predictions to exactly zero.
	    p[np.abs(p) < 0.0001] = 0

	    joint = np.concatenate([y, p], axis=1)
	    columns = [h+"(actual)" for h in y_h] + [h+"(predicted)" for h in y_h]
	    pd.DataFrame(joint, columns=columns).to_csv("result"+tag+"_"+str(j)+".csv")

	    g = Graphics()
	    fig = g.dayPlot2D("20191231", y=[y.T, p.T], title=["Actual", "Predicted"], xlabel=["day","day"], ylabel=[y_h, y_h])
	    fig.savefig("Predicted"+tag+"_"+str(j)+".png")

	    return mean_squared_error(y.T[0], p.T[0])

	  def gradation(self, forest, fix, range1, range2):
	    """Tabulate predictions over a 2-D grid of two inputs and save ``gradation.csv``.

	    Entries of ``fix`` equal to -1 are replaced by a value from ``range1`` and
	    entries equal to -2 by a value from ``range2``; all other entries stay
	    fixed. Note that any genuine input value of -1 or -2 is replaced as well.

	    The saved table has a leading 0, then ``range2`` across the first row,
	    ``range1`` down the first column, and predictions in the remaining cells.

	    Args:
	      forest: Fitted regressor with ``predict``.
	      fix: Template input vector containing the -1 / -2 placeholders.
	      range1: Values substituted for -1 (table rows).
	      range2: Values substituted for -2 (table columns).
	    """
	    rows = [[0, *range2]]
	    for i, x1 in enumerate(range1):
	      row = [x1]
	      for j, x2 in enumerate(range2):
	        x = np.array(fix)
	        x = np.where(x==-1, x1, x)
	        x = np.where(x==-2, x2, x)
	        forsee = forest.predict(x.reshape(1,-1))
	        print(i, j, x, forsee)
	        row.extend(np.ravel(forsee))
	      rows.append(row)
	    hist_p = np.array(rows, dtype=float)

	    df = pd.DataFrame(hist_p)
	    df.to_csv("gradation.csv")

	  def showRF(self, rf):
	    """Render the first tree of the fitted forest ``rf`` to ``./tree.png``."""
	    estimator = rf.estimators_[0]
	    filename = "./tree.png"
	    dot_data = tree.export_graphviz(
	                estimator,
	                out_file=None,
	                filled=True,
	                rounded=True,
	                special_characters=True
	                )
	    graph = pdp.graph_from_dot_data(dot_data)
	    graph.write_png(filename)