Last active
October 31, 2017 05:56
-
-
Save djfan/7ad556a8f0f045c04a09f960c8154e76 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Mon Oct 30 09:26:59 2017 | |
@author: yvette.wang | |
""" | |
import pandas as pd | |
import numpy as np | |
import re | |
add_input = './DCM Lookup Table.xlsx' | |
sheet_name = 'Sheet2' | |
add_output = './output3.xlsx' | |
#lookup Channel | |
df1=pd.read_excel(add_input, sheetname=sheet_name) | |
df1 = df1.dropna(how='all') | |
df1['Channel'] = np.where(df1['Ad group'].astype(str).str.contains('TrueView')|df1['Ad group'].astype(str).str.contains('trueview')|df1['Ad group'].astype(str).str.contains('truewiew')|df1['Ad group'].astype(str).str.contains('_YT_')==True, 'Display-TrueView', 'Display') | |
print('Channel Finished') | |
#lookup medium | |
#1.Python cannot read this specific character '|mobile|', but we need to lookup all together in 'Ad group' | |
df1['Medium'] = np.where(df1['Ad group'].astype(str).str.contains('_Mobile')|df1['Ad group'].astype(str).str.contains('\|mobile\|'),'Mobile','Desktop') | |
print('Medium Finished') | |
#lookup Brand | |
colname = 'Brand' | |
campaign = df1.Campaign | |
brands = [] | |
for s in campaign: | |
s = s.lower() | |
if s.find('evinrude') != -1: | |
brands.append('Evinrude') | |
elif s.find('spyder') != -1: | |
brands.append('Spyder') | |
elif s.find('ski-doo') != -1: | |
brands.append('Ski-Doo') | |
elif s.find('sea-doo') != -1: | |
brands.append('Sea-Doo') | |
else: | |
brands.append('Can-Am ORV') | |
# append to corresponding column | |
df1[colname] = brands | |
print('Brand Finished') | |
#Lookup Size | |
colname = 'Size' | |
# Patterns | |
# pa1 = re.compile(r"video|_OLV_") | |
# pa2 = re.compile(r"[0-9\s]+x[0-9\s]+") | |
pa2 = re.compile(r"[0-9]+x[0-9]+") | |
# pa21 = re.compile(r"[_\s]([0-9\s]+x[0-9\s]+)") | |
# pa22 = re.compile(r"[_\w]([0-9][0-9\s]*x[0-9\s]+)") | |
pa3 = re.compile(r"[0-9]+s|[_:]15[_\s]|[_:]30[_\s]|[_:]45[_\s]|[_:]60[_\s]") | |
pa4 = re.compile(r'[0-9]+s') | |
# data pre-processing | |
d = df1.loc[:,'Ad group'] | |
d = d.dropna() | |
# Video or Display | |
#2. when a placement include both _video_/_trueview_ and 1x1 together, then return to 15s | |
video_trueview = [] | |
for i in range(len(d)): | |
flag = ((d[i].lower().find('_video_') != -1) or (d[i].lower().find('_trueview_') != -1)) | |
if flag: | |
video_trueview.append(1) | |
else: | |
video_trueview.append(0) | |
#3. when placement doesn't include display or video or Trueview, which means we can't identify the placement channel, then size return to False | |
none = [] | |
for i in range (len(d)): | |
flag = ((d[i].lower().find('display') == -1) and (d[i].lower().find('video') == -1) and (d[i].lower().find('trueview') == -1)) | |
if flag: | |
none.append(1) | |
else: | |
none.append(0) | |
# video = [] | |
# for i in range(len(d)): | |
# flag = ((d[i].lower().find('video') != -1) or (d[i].find('_OLV_') != -1)) | |
# if flag: | |
# video.append(1) | |
# else: | |
# video.append(0) | |
# remove space function | |
remove_space = lambda x: x.replace(' ', '') | |
# extract size for each record | |
all_size = [] | |
for i in range(len(d)): | |
result = re.findall(pa3, d[i]) | |
if len(result) != 0: | |
res_sec = re.findall(pa4, result[0]) | |
if len(res_sec) > 0: | |
all_size.append(result) | |
else: | |
all_size.append([result[0].replace('_', '').replace(' ','').replace(':', '') + 's']) | |
else: | |
result = re.findall(pa2, d[i].replace(' x ', 'x')) | |
res_no_space = map(remove_space, result) | |
res_unique = list(set(res_no_space)) | |
if len(res_unique) > 1: | |
try: | |
res_unique.remove(u'1x1') | |
res_unique.remove(u'0x0') | |
except ValueError: | |
pass | |
if (len(res_unique) > 1 or res_unique == [u'0x0']): | |
all_size.append(False) | |
else: | |
if video_trueview[i] == 1 and res_unique == [u'1x1']: # #2 solved | |
all_size.append([u'15s']) | |
elif none[i] == 1 and res_unique == [u'1x1']: # #3 solved | |
all_size.append(False) | |
#2. if video[i] == 1 and res_unique == [u'1x1']: | |
# all_size.append(u'15s') | |
#3. if none[i] == 1 and res_unique == [u'1x1']: | |
# all_size.append(False) | |
else: | |
all_size.append(res_unique) | |
all_size = [s[0] if s else 'False' for s in all_size] | |
print('length:', len(all_size), '#False:', all_size.count(False)) | |
df1[colname] = all_size | |
print('Size Finished') | |
# 4. Lookup Language | |
colname = 'Language' | |
ddf = df1.loc[:,['Ad group', 'Country']] | |
languageresults = [] | |
for i in range(len(ddf)): | |
if ddf.loc[i, 'Country'] == 'US': | |
languageresults.append('EN') | |
else: | |
languageresults.append(np.where((df1.loc[i,'Ad group'].find('_FR_')!=-1)|(df1.loc[i,'Ad group'].find('|fr|')!=-1), 'FR', 'EN')) # here, '|fr|' not '\|fr\|'. str.find() is noe like pd.Series.contain() | |
df1[colname] = languageresults | |
print('Language Finished') | |
# ''' | |
# Partner Type | |
# ''' | |
colname = 'Partner Type' | |
df = df1.loc[:,['Publisher', 'Ad group', 'Size', 'Channel']] | |
# df.Size = df.Size | |
# df.Size = map(lambda x: x.lower(), df.Size) | |
# df.Size = [s.lower() if s else False for s in df.Size] | |
# print df.Size[0] | |
PBU = ['PBU Canada', 'AppNexus', 'DOUBLECLICK TECH', 'TubeMogul'] | |
video_pattern = re.compile(r"[0-9]+s") | |
results = [] | |
for i in range(len(df)): | |
if df.loc[i, 'Channel'] == 'Display-TrueView': | |
results.append('Programmatic Video') | |
else: | |
if df.loc[i, 'Publisher'] in PBU: | |
try: | |
flag = len(re.findall(video_pattern, df.loc[i, 'Size'])) > 0 | |
if flag: | |
results.append('Programmatic Video') | |
else: | |
results.append('Programmatic') | |
except TypeError: | |
results.append(False) | |
else: | |
try: | |
flag = len(re.findall(video_pattern, df.loc[i, 'Size'])) > 0 | |
if flag: | |
results.append('Contextual Video') | |
else: | |
results.append('Contextual') | |
except TypeError: | |
results.append(False) | |
# I need one more rule that if size is False then partner type return False | |
# [Solved!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!] | |
print(results.count('Programmatic Video'), results.count('Programmatic'), results.count('Contextual Video'), results.count('Contextual')) | |
df1[colname] = results | |
print('Partner Type Finished') | |
# write out | |
df1.to_excel(add_output) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment