Skip to content

Instantly share code, notes, and snippets.

@spidezad
Created May 21, 2019 06:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save spidezad/ea0092cce47865bd0389d7d53a241274 to your computer and use it in GitHub Desktop.
Save spidezad/ea0092cce47865bd0389d7d53a241274 to your computer and use it in GitHub Desktop.
shopee_for_wordpress_publish.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport os, sys, datetime, re\n\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\nfrom sklearn.metrics import accuracy_score, confusion_matrix\nfrom sklearn.metrics import classification_report\n\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.feature_extraction.text import TfidfTransformer\n\nimport matplotlib.pyplot as plt",
"execution_count": 12,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "fname = 'mobile_data_info_train_competition.csv'\ndf = pd.read_csv(fname)",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "plt.figure()\ndf.hist(figsize=(10,10))\nplt.show()",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": "<Figure size 432x288 with 0 Axes>"
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "\n",
"text/plain": "<Figure size 720x720 with 12 Axes>"
},
"metadata": {},
"output_type": "display_data"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
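"source": "def clean_text(text):\n \"\"\" Return a lowercased copy of text with separator characters replaced by spaces and all other special characters removed.\n \"\"\"\n\n text = text.lower() \n text = re.sub('[/(){}\\[\\]\\|@,;]',' ', text) # replace separator characters with a space\n text = re.sub('[^0-9a-z #+_]', '', text) # remove everything except digits, lowercase letters, space, #, + and _\n\n return text",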
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Replace NaN with the most frequent value (mode) of each column, chosen after data exploration\ndf['Warranty Period'] = df['Warranty Period'].fillna(df['Warranty Period'].mode()[0])\ndf['Operating System'] = df['Operating System'].fillna(df['Operating System'].mode()[0])\ndf['Network Connections'] = df['Network Connections'].fillna(df['Network Connections'].mode()[0])",
"execution_count": 7,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Convert attribute codes to integer-valued strings; missing values pass through a temporary -1 placeholder and end up as NaN again\nfor col in [ 'Operating System', 'Features','Network Connections', 'Memory RAM', 'Brand', 'Warranty Period',\\\n 'Storage Capacity', 'Color Family', 'Phone Model', 'Camera', 'Phone Screen Size']:\n df[col] = df[col].fillna(-1)\n df[col] = df[col].astype('int')\n df[col] = df[col].astype('str')\n df[col] = df[col].replace('-1', np.nan)\n\ndf['title1'] = df['title'].apply(clean_text)",
"execution_count": 10,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Prepare model -- Drop na and keep those with values\ndef get_X_Y_data(x_col, y_col):\n sub_df = df[[x_col, y_col]]\n sub_df.head()\n sub_df = sub_df.dropna()\n return sub_df[x_col], sub_df[y_col]\n\ndef generate_model(X, y):\n \n X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)\n\n pred_model = Pipeline([('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),#hinge\n ])\n pred_model.fit(X_train, y_train)\n\n y_pred = pred_model.predict(X_test)\n\n print('accuracy %s' % accuracy_score(y_pred, y_test))\n #print(classification_report(y_test, y_pred))\n \n return pred_model\n \n\nX, y = get_X_Y_data('title1', 'Brand')\nbrand_model = generate_model(X, y)\nprint('='*29)\n\nX, y = get_X_Y_data('title1', 'Operating System')\nos_model = generate_model(X, y) \nprint('='*29)\n\nX, y = get_X_Y_data('title1', 'Network Connections')\nnetwork_model = generate_model(X, y)\nprint('='*29)\n\nX, y = get_X_Y_data('title1', 'Warranty Period')\nwarranty_model = generate_model(X, y)\nprint('='*29)\n\nX, y = get_X_Y_data('title1', 'Color Family')\ncolor_model = generate_model(X, y)\nprint('='*29)\n\nX, y = get_X_Y_data('title1', 'Phone Model')\nphonemodel_model = generate_model(X, y)\nprint('='*29)\n\nX, y = get_X_Y_data('title1', 'Storage Capacity')\nstorage_model = generate_model(X, y)\nprint('='*29)\n\nX, y = get_X_Y_data('title1', 'Memory RAM')\nram_model = generate_model(X, y)\nprint('='*29)\n\nX, y = get_X_Y_data('title1', 'Phone Screen Size')\nscreen_model = generate_model(X, y)\nprint('='*29)\n\nX, y = get_X_Y_data('title1', 'Features')\nfeature_model = generate_model(X, y)\nprint('='*29)\n\nX, y = get_X_Y_data('title1', 'Camera')\ncamera_model = generate_model(X, y)\nprint('='*29)",
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"text": "accuracy 0.9806071551427589\n=============================\naccuracy 0.9536996611156157\n=============================\naccuracy 0.9645938585001768\n=============================\naccuracy 0.9384186781429967\n=============================\naccuracy 0.7848330058939096\n=============================\naccuracy 0.8950025637991559\n=============================\naccuracy 0.9249406175771971\n=============================\naccuracy 0.8397615708274895\n=============================\naccuracy 0.6734631460903252\n=============================\naccuracy 0.7034056252117926\n=============================\naccuracy 0.6215263657313271\n=============================\n",
"name": "stdout"
}
]
}
],
"metadata": {
"_draft": {
"nbviewer_url": "https://gist.github.com/91545bc442cb4f4e5d5387f3bfb03f55"
},
"gist": {
"id": "91545bc442cb4f4e5d5387f3bfb03f55",
"data": {
"description": "shopee_for_wordpress_publish.ipynb",
"public": true
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment