Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save inoccu/af1de96ed58ed8b6f339a5c1a9989dac to your computer and use it in GitHub Desktop.
Save inoccu/af1de96ed58ed8b6f339a5c1a9989dac to your computer and use it in GitHub Desktop.

Pythonによるデータ分析・機械学習プログラミングとAIのローコード開発

1.データ分析のためのPythonプログラミング

Pythonの基本構文(1)

# FizzBuzz for 1..100: print a word for multiples of 3 and/or 5,
# and the number itself otherwise.
for i in range(1, 101):
    divisible_by_3 = (i % 3 == 0)
    divisible_by_5 = (i % 5 == 0)
    if not (divisible_by_3 or divisible_by_5):
        print(i)
    elif not divisible_by_5:
        print('Fizz!')
    elif not divisible_by_3:
        print('Buzz!')
    else:
        print('Fizz Buzz!')

関数の引数

# print() joins multiple positional arguments with a single space by default.
print('abc', 'def', 'ghi')
# The keyword argument `sep` replaces that separator, here with '::'.
print('abc', 'def', 'ghi', sep='::')

int(整数)型

# Integer arithmetic: the parenthesised part is evaluated first (7), then
# multiplied by 9.
a = 9 * (5 + 3 - 1)
print(a)

float(浮動小数点)型

# Binary floating point cannot represent 6.3 exactly, so the remainder
# computed here is close to, but not exactly, 0.3.
a = (5.0 + 3.0 - 1.0) * 9.0 / 10.0 % 6.0
print(a)
# int() truncates toward zero.
print(int(a))

# Decimal does exact base-10 arithmetic.  Build the operands from strings:
# Decimal(<float>) would copy the float's binary rounding error into the
# Decimal (for example Decimal(0.1) != Decimal('0.1')).
from decimal import Decimal
a = (Decimal('5.0') + Decimal('3.0') - Decimal('1.0')) * Decimal('9.0') / Decimal('10.0') % Decimal('6.0')
print(a)

str(文字列)型(1)

# Single and double quotes both delimit strings; double quotes let the
# apostrophe in "I'm" appear without escaping.
str1 = 'Hello'
print(str1)

str2 = "I'm sure that "
str3 = "you can write python."
# Join the two fragments into one sentence.
str4 = ''.join((str2, str3))
print(str4)

str(文字列)型(2)

# printf-style formatting: each %s is replaced, in order, by an item of the tuple.
print('str2:%s str3:%s' % (str2, str3))

str(文字列)型(3)

# FizzBuzz again, but numbers divisible by neither 3 nor 5 are reported
# through an f-string message instead of the bare number.
for i in range(1, 101):
    if i % 15 == 0:  # divisible by both 3 and 5
        message = 'Fizz Buzz!'
    elif i % 3 == 0:
        message = 'Fizz!'
    elif i % 5 == 0:
        message = 'Buzz!'
    else:
        message = f'{i}は3または5では割り切れません'
    print(message)

str(文字列)型(4)

# Indexing is zero-based: str4[4] is the fifth character.
print(str4[4])
# A slice [start:stop] includes start and excludes stop.
print(str4[4:10])
# Omitting stop slices through to the end of the string.
print(str4[4:])

データ型の確認

# type() returns the class of the object.
print(type(str4))

# isinstance() is the idiomatic type test; unlike `type(x) is str` it also
# accepts instances of str subclasses.
if isinstance(str4, str):
    print('str4 is str')
else:
    print('str4 is not str')

Pythonのライブラリと関数の呼び出し

# Importing the module itself: the class is reached as datetime.datetime.
import datetime
datetime.datetime.now()
# Importing only the class under the alias dt gives a shorter call.
from datetime import datetime as dt
dt.now()

インストールされているパッケージの確認

!pip list

requestsパッケージのインストール

!pip install requests

インストールされているパッケージの記録

!pip freeze
!pip freeze > requirements.txt
!pip install -r requirements.txt

requestsでデータをダウンロード

import requests

# Download the imports-85 data file and store the bytes unchanged.
# timeout= keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data',
    timeout=30,
)
# Fail loudly on a 4xx/5xx response instead of silently saving an error
# page as if it were the dataset.
response.raise_for_status()
with open('imports-85.data', mode='wb') as f:
    f.write(response.content)

タイトル行の追加

# Header line holding the column names; the downloaded file has none, so it
# is re-written below with this line placed in front of its contents.
title = 'symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price'

with open('imports-85.data', mode='r') as src:
    body = src.read()

# Header, newline, then the original contents.
data = f'{title}\n{body}'

with open('automobile.csv', mode='w') as dst:
    dst.write(data)

2.データ分析の基礎と応用

!pip install pandas

Pandas(1)データの読み込みと表示

import pandas as pd
# Allow up to 30 columns to be displayed before pandas starts eliding them.
pd.set_option('display.max_columns', 30)

# Load the CSV file (its first line is used as the column names).
df = pd.read_csv('automobile.csv')
# Show the first rows; head() returns 5 rows when called without an argument.
df.head()

Pandas(2)データの形状と列名の取得

# shape is a (rows, columns) tuple.
print('shape:', df.shape)
# columns is the Index of column labels.
print('columns:', df.columns)

Pandas(3)列名を指定したデータの取得

# Indexing with a list of labels selects those columns and returns a DataFrame.
df[['make', 'price']]

Pandas(4)条件を指定したデータの取得

# Two ways to keep the rows whose width exceeds 70:
# 1) select the columns first, then filter with a query expression;
df[['make', 'width', 'price']].query('width > 70')
# 2) filter with a boolean mask first, then select the columns.
df[df['width'] > 70][['make', 'width', 'price']]

Pandas(5)indexを指定したデータの取得

# .loc slices by index label and includes both end points (labels 10 through 15).
df[['make', 'price']].loc[10:15]

Pandas(6)グルーピングと統計計算

# Mean of 'width' for each distinct value of 'make'.
df[['make', 'width']].groupby(['make']).mean()

Pandas(7)基本統計量を見る

# Summary statistics (count, mean, std, min, quartiles, max); for a frame with
# mixed dtypes pandas reports only the numeric columns by default.
df.describe()

Pandas(8)型の確認

# dtype of every column; columns holding strings are reported as object.
print(df.dtypes)

Pandas(9)特定文字列のNaNへの置換と型変換

import numpy as np

# Columns in which the string '?' is treated as a missing value and which
# are afterwards converted to float64.
placeholder_columns = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']

# Turn every '?' into NaN so the column can be cast to a numeric dtype.
for column in placeholder_columns:
    df[column] = df[column].replace('?', np.nan)

df = df.astype({column: 'float64' for column in placeholder_columns})

Pandas(10)量的データに絞り込んで基本統計量を見る

# describe() restricted to four quantitative columns.
df[['width', 'length', 'horsepower', 'price']].describe()

分散

# Variance of each selected column (pandas computes the sample variance, ddof=1, by default).
df[['horsepower', 'price']].var()

Pandas(11)カテゴリデータ(質的データ)の値の種類を見る

# Distinct values that occur in the 'make' column.
print(df['make'].unique())
# Selecting a single column with one label yields a pandas Series.
print(type(df['make']))

Pandas(12)欠損値の確認

# Number of missing (NaN) values in each column.
df.isnull().sum()

Pandas(13)欠損値のある行の削除

# count() reports the number of non-null values per column.
print('df count:', df.count())
# dropna() returns a new frame without any row that contains a missing value;
# df itself is left unchanged.
df_a = df.dropna()
print('df_a count:', df_a.count())

Pandas(14)欠損値の補完

# Columns whose missing values are filled with that column's own median.
median_filled_columns = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']
column_medians = {name: df[name].median() for name in median_filled_columns}

# fillna() with a dict fills each listed column with its mapped value and
# returns a new frame.
df_b = df.fillna(column_medians)
# Missing values remaining per column after the fill.
df_b.isnull().sum()

データの可視化

!pip install matplotlib seaborn
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

価格のヒストグラム

# Histogram of the 'price' column of df_b.
df_b['price'].hist()

メーカーの棒グラフ

# Count the rows per manufacturer.  Series.value_counts() is used rather than
# DataFrame.value_counts() on a one-column frame: the latter indexes the result
# by tuples, which would label the bars like "(toyota,)" instead of "toyota".
make_count = df['make'].value_counts()
make_count
# One bar per manufacturer, ordered by descending count.
make_count.plot.bar()

馬力と価格の相関係数を求める

# Pairwise correlation (Pearson, the pandas default) between horsepower and price.
df_b[['horsepower', 'price']].corr()

相関行列の作成(1)

import seaborn as sns
plt.figure(figsize=(9, 9))
# df_b still contains string columns at this point, and DataFrame.corr()
# raises on non-numeric data in pandas 2.x, so restrict the frame to its
# numeric columns before computing the correlation matrix.
# annot=True writes each coefficient into its cell.
sns.heatmap(df_b.select_dtypes(include='number').corr(), annot=True)

カテゴリ変数のダミー変数化

# One-hot encode 'make': get_dummies creates one indicator column per distinct value.
df2 = pd.get_dummies(df_b[['make']])
df2

DataFrameの結合

# Swap the original 'make' column for its indicator columns by concatenating
# the two frames side by side (axis=1).
df3 = pd.concat([df_b.drop(['make'], axis=1), df2], axis=1)
df3

相関行列の作成(2)

plt.figure(figsize=(12, 12))
# df3 still carries the remaining string columns, on which DataFrame.corr()
# raises in pandas 2.x.  Keep numeric columns plus bool ones: recent pandas
# versions emit the get_dummies indicator columns as bool.
sns.heatmap(df3.select_dtypes(include=['number', 'bool']).corr(), annot=True)

ラベルエンコーディングの使用

from sklearn.preprocessing import LabelEncoder

# Replace every categorical column of df_b, in place, with integer codes.
categorical_columns = ['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system']
for category in categorical_columns:
    # A fresh encoder per column keeps each column's label-to-code mapping
    # independent of the others.
    le = LabelEncoder()
    df_b[category] = le.fit_transform(df_b[category])
df_b

相関行列の作成(3)

# Correlation heatmap over the columns of df_b on a 15x15 inch figure;
# annot=True writes each coefficient into its cell.
plt.figure(figsize=(15, 15))
sns.heatmap(df_b.corr(), annot=True)

散布図行列を描く

# Columns to compare pairwise in the scatter-plot matrix.
scatter_columns = [
    'drive-wheels', 'wheel-base', 'length', 'width', 'curb-weight',
    'engine-size', 'fuel-system', 'bore', 'horsepower', 'city-mpg',
    'highway-mpg', 'price',
]
# One scatter plot per pair of columns, drawn on a 15x15 inch figure.
pd.plotting.scatter_matrix(
    df_b[scatter_columns],
    figsize=(15, 15),
    range_padding=0.2,
)
plt.show()

加工したデータフレームをCSVファイルとして保存

# index=False leaves the row index out of the file, so only the data columns are written.
df_b.to_csv('automobile_converted.csv', index=False)

3.機械学習による予測モデルの作成

データの読み込み

import pandas as pd

# Load the preprocessed CSV file and preview its first rows.
df = pd.read_csv('automobile_converted.csv')
df.head()

目的変数と説明変数

# Target variable: the price column, as a Series and as a NumPy array.
y_var = df['price']
y_array = y_var.values

# Explanatory variables: every column except price, as a DataFrame and as
# a NumPy array.
X_var = df.drop(columns=['price'])
X_array = X_var.values

訓練データとテストデータ

from sklearn.model_selection import train_test_split

# Use 80% of the rows for training and hold out the rest for testing;
# random_state=0 makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_array, y_array, train_size=0.8, random_state=0)

線形回帰で機械学習

from sklearn import linear_model

# Linear regression model fitted on the training split.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

傾きと切片

# coef_ holds one coefficient per explanatory variable (the slopes);
# intercept_ is the constant term.
print('傾き: %s' % model.coef_)
print('切⽚: %s' % model.intercept_)

学習済みモデルの評価

# score() of a scikit-learn regressor is the coefficient of determination R^2.
# Printing it for both splits shows how far test performance falls behind
# training performance.
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

学習済みモデルで予測する

# predict() expects a 2-D input, so the single test row is wrapped in a list.
model.predict([X_test[0]])
# The actual target value of the same row, for comparison.
y_test[0]

決定木でのモデル作成

from sklearn.tree import DecisionTreeRegressor

# Decision-tree regression, with the tree limited to a depth of 3.
dtr = DecisionTreeRegressor(max_depth=3)
dtr.fit(X_train, y_train)

# R^2 on the training split first, then on the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(dtr.score(features, target))

決定木を描く

%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(20, 10))
plot_tree(
    dtr,
    label='all',  # other accepted values include 'none'
    max_depth=3,
    filled=True,
    # Name only the explanatory columns: df.columns would also contain the
    # target 'price', which is not one of the features the tree was fitted
    # on.  A plain list is passed because some scikit-learn releases reject
    # a pandas Index for this parameter.
    feature_names=list(df.drop('price', axis=1).columns),
    fontsize=12,
)
plt.show()

ランダムフォレストでのモデル作成

from sklearn.ensemble import RandomForestRegressor

# Random-forest regression: 50 trees, each limited to a depth of 3.
# fit() returns the estimator itself, so construction and training are chained.
rfr = RandomForestRegressor(n_estimators=50, max_depth=3).fit(X_train, y_train)

# R^2 on the training split, then on the test split.
train_score = rfr.score(X_train, y_train)
print(train_score)
test_score = rfr.score(X_test, y_test)
print(test_score)

XGBoostでのモデル作成

!pip install xgboost
from xgboost import XGBRegressor

# XGBoost regression with 50 boosting rounds.
# `use_label_encoder` only ever concerned the classifier's label handling and
# has been deprecated, so it is not passed here.  `logloss` is a
# classification metric; RMSE is the matching metric for a regression target.
xgb = XGBRegressor(
    n_estimators=50,
    eval_metric='rmse',
)
xgb.fit(X_train, y_train)
# R^2 on the training split, then on the test split.
print(xgb.score(X_train, y_train))
print(xgb.score(X_test, y_test))

4.ローコードAIツールPycaretの活用と機械学習の実践

Pycaretのインストール

!pip install pycaret

データの読み込みと分割

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('automobile_converted.csv')

# Draw a reproducible random 80% of the rows for training; the rows that
# were not drawn form the test set.
train = df.sample(frac=0.8, random_state=111)
test = df.drop(train.index)

# Renumber both frames from 0 and discard the old index labels.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print(f'train: {train.shape}')
print(f'test: {test.shape}')

Pycaretにデータをセットアップ

from pycaret.regression import *
# Initialise the PyCaret regression experiment on the training frame with
# 'price' as the target column.
reg01 = setup(data=train, target='price')

アルゴリズムの選択

# Train and score the regressors PyCaret offers and keep the one it ranks best.
best_model = compare_models()

モデルの作成

# Build one specific model by its PyCaret id; 'ridge' selects ridge regression.
model = create_model('ridge')

ハイパーパラメータのチューニング

# Search for better hyperparameters for the model created above.
tuned = tune_model(model)

モデルの評価

# Show PyCaret's evaluation plots for the tuned model
# (NOTE(review): rendered as an interactive notebook widget — confirm in your environment).
evaluate_model(tuned)

モデルのファイナライズ

# Produce the final model for deployment
# (per the PyCaret docs this refits on the whole dataset given to setup(),
# including the hold-out part — TODO confirm for the installed version).
final = finalize_model(tuned)
print(final)

予測の実行と精度評価

# Apply the finalized model to the rows that were held back as `test`;
# the returned frame is previewed with head().
test_predictions = predict_model(final, data=test)
test_predictions.head()

モデルの保存と使用

# Persist the finalized model under the name 'automobile_final' so it can be
# restored later with load_model('automobile_final').
save_model(final, 'automobile_final')
from pycaret.regression import *
import pandas as pd

# Restore the model that was saved under the name 'automobile_final'.
model = load_model('automobile_final')

# Feature names, followed by one sample row holding a value for each of them
# in the same order.
columns = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
]
data = [[
    2.0, 164.0, 1.0, 1.0, 0.0, 1.0, 3.0, 0.0, 0.0, 99.4, 176.6, 66.4, 54.3,
    2824.0, 3.0, 1.0, 136.0, 5.0, 3.19, 3.4, 8.0, 115.0, 5500.0, 18.0, 22.0,
]]

df = pd.DataFrame(data, columns=columns)
predictions = predict_model(model, df)

# The prediction is read from the 'prediction_label' column of the result.
predicted_price = predictions['prediction_label'].iloc[0]
print('predicted price: %i' % predicted_price)

作成したモデルのデプロイ

FlaskとNgrokのインストール

!pip install flask flask-ngrok pyngrok
!ngrok authtoken <token>

Flaskアプリの作成

from pycaret.regression import *
import pandas as pd
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)

# Feature columns expected in each submitted row, in this order.
FEATURE_COLUMNS = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg']

# Load the saved model once at start-up instead of re-reading it from disk
# on every request.
price_model = load_model('automobile_final')


@app.route('/predict', methods=['POST'])
def predict():
    """Predict a price for the first row of the posted JSON payload.

    Expects a JSON body of the form ``{"data": [[...feature values...]]}``
    whose rows follow the order of ``FEATURE_COLUMNS``.

    Returns:
        ``{"price": <float>}`` on success, or ``{"error": <message>}`` with
        HTTP status 400 when the body is not usable.
    """
    # silent=True yields None instead of raising when the body is not valid JSON.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'data' not in payload:
        return jsonify({'error': "request body must be JSON with a 'data' field"}), 400

    try:
        df = pd.DataFrame(payload['data'], columns=FEATURE_COLUMNS)
    except (ValueError, TypeError) as exc:
        # Raised e.g. when a row does not have one value per feature column.
        return jsonify({'error': f'invalid data: {exc}'}), 400
    if df.empty:
        return jsonify({'error': "'data' must contain at least one row"}), 400

    predictions = predict_model(price_model, df)
    return jsonify({
        'price': float(predictions[['prediction_label']].values[0][0])
    })


app.run()

APIの実行

!pip install requests
import json
import requests

# NOTE(review): the host name below looks like a per-session ngrok tunnel
# address; replace it with the URL of your own running tunnel — TODO confirm.
response = requests.post(
    'http://ce59a77d4613.ngrok.io/predict',
    # json= serialises the payload and sets the JSON Content-Type header.
    json={'data': [[2.0, 164.0, 1.0, 1.0, 0.0, 1.0, 3.0, 0.0, 0.0, 99.4, 176.6, 66.4, 54.3, 2824.0, 3.0, 1.0, 136.0, 5.0, 3.19, 3.4, 8.0, 115.0, 5500.0, 18.0, 22.0]]},
    # Do not wait forever if the tunnel or the app is unreachable.
    timeout=30,
)
# Surface HTTP errors here rather than as a confusing JSON decoding failure
# when the server answers with an error page.
response.raise_for_status()
result = response.json()
result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment