Skip to content

Instantly share code, notes, and snippets.

View gcsfred's full-sized avatar

Gustavo Frederico gcsfred

View GitHub Profile
@gcsfred
gcsfred / AuthConfig.cs
Created October 4, 2013 04:33
AuthConfig.cs no projeto TesteSimpleMembershipEmpty
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.Web.WebPages.OAuth;
using WebMatrix.WebData;
namespace TesteSimpleMembershipEmpty
{
public static class AuthConfig
@gcsfred
gcsfred / web.config
Created October 4, 2013 05:21
web.config no projeto TesteSimpleMembershipEmpty
<?xml version="1.0" encoding="utf-8"?>
<!--
Para obter mais informações sobre como configurar o aplicativo ASP.NET, visite
http://go.microsoft.com/fwlink/?LinkId=169433
-->
<configuration>
<configSections>
<!-- For more information on Entity Framework configuration, visit http://go.microsoft.com/fwlink/?LinkID=237468 -->
<section name="entityFramework" type="System.Data.Entity.Internal.ConfigFile.EntityFrameworkSection, EntityFramework, Version=5.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" requirePermission="false" />
<sectionGroup name="dotNetOpenAuth" type="DotNetOpenAuth.Configuration.DotNetOpenAuthSection, DotNetOpenAuth.Core">
@gcsfred
gcsfred / sample.py
Created November 15, 2018 12:38
OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
# ...
def one_hot_encode(_df, input_column, output_column):
indexer = StringIndexer(inputCol=input_column, outputCol=input_column+"_indexed", handleInvalid='skip')
_model = indexer.fit(_df)
_td = _model.transform(_df)
encoder = OneHotEncoder(inputCol=input_column+"_indexed", outputCol=output_column, dropLast=True)
@gcsfred
gcsfred / pandas_udf_sample.py
Created November 15, 2018 12:42
pandas_udf fill missing and empty string
from pyspark.sql.functions import pandas_udf
#...
# Use pandas_udf to define a Pandas UDF
@pandas_udf('string')
# Input/output are both a pandas.Series of string
def pandas_not_null(s):
    """Substitute placeholder markers for missing and empty entries.

    Missing values in ``s`` are filled with "_NO_₦Ӑ_" first; afterwards,
    entries equal to the empty string are replaced with '_NO_ӖӍΡṬΫ_'.
    The resulting Series is returned.
    """
    without_missing = s.fillna("_NO_₦Ӑ_")
    return without_missing.replace('', '_NO_ӖӍΡṬΫ_')
@gcsfred
gcsfred / dataframe_using_pandas_udf.py
Created November 15, 2018 12:43
dataframe using pandas_udf and one hot encode
# Add 'ACOLUMN_not_null' by applying the pandas_not_null UDF to 'ACOLUMN'.
dataframe = dataframe.withColumn('ACOLUMN_not_null', pandas_not_null('ACOLUMN'))
# Pass the cleaned column to one_hot_encode, naming the output column 'ACOLUMN_one_hot'.
dataframe = one_hot_encode(dataframe, "ACOLUMN_not_null", "ACOLUMN_one_hot")
@gcsfred
gcsfred / pandas_udf_nlp.py
Created November 15, 2018 12:55
Define a pandas_udf-annotated function that vectorizes a column of text from a DataFrame
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType
import spacy
#...
# nlp = spacy.load('en_core_web_lg')
nlp = spacy.load('en_core_web_sm')
#...
# Use pandas_udf to define a Pandas UDF
@gcsfred
gcsfred / dataframe_using_pandas_udf.py
Created November 15, 2018 13:02
dataframe creating a column using pandas_udf
# Add 'description_vec' by applying pandas_nlp to the 'description' column.
dataframe = dataframe.withColumn('description_vec', pandas_nlp('description'))
@gcsfred
gcsfred / udf_two_columns_concat.py
Created November 17, 2018 19:32
Concatenate two columns of a DataFrame using UDF
import pyspark.sql.functions as f
import pyspark.sql.types as t
# ...
def udf_concat_vec(a, b):
    """Join two vectors end to end and return the result as a plain list.

    Both ``a`` and ``b`` are expected to be SparseVector instances (per the
    original author's note); each is converted with ``toArray()`` and the
    two arrays are concatenated in the order (a, b).
    """
    left = a.toArray()
    right = b.toArray()
    joined = np.concatenate((left, right))
    return joined.tolist()
# Wrap udf_concat_vec as a Spark UserDefinedFunction declared to return an array of floats.
my_udf_concat_vec = f.UserDefinedFunction(udf_concat_vec, t.ArrayType(t.FloatType()))
@gcsfred
gcsfred / UDF_variable_num_columns.py
Last active November 19, 2018 15:37
Creating a new column in a DataFrame based on a variable number of other columns.
import pyspark.sql.functions as f
import pyspark.sql.types as t
# ...
# Copy 'columnA' into two additional columns, 'columnB' and 'columnC'.
data_frame = data_frame.withColumn('columnB', data_frame['columnA'])
data_frame = data_frame.withColumn('columnC', data_frame['columnA'])
# Names of the three columns; presumably the inputs for the UDF built later
# in the gist (not visible in this preview) -- TODO confirm.
attrs = ['columnA', 'columnB', 'columnC']
@gcsfred
gcsfred / search_with_personalization_fragment.py
Created February 3, 2020 15:49
Search with personalization (fragment)
def search_with_personalization(user, search):
    """Issue a search that is fed with per-user category and product rankings.

    Reads 'config.conf' into a ConfigParser, then calls the
    get_*_recommendations and get_*_ranking helpers for categories and for
    products, and finally hands ``search`` plus both ranked results to
    ``query_es``. In this fragment the result of ``query_es`` is neither
    captured nor returned.
    """
    settings = configparser.ConfigParser()
    settings.read('config.conf')

    # Categories first: recommendations feed straight into the ranking call.
    ranked_categories = get_category_ranking(
        settings, user, get_category_recommendations(settings, user)
    )

    # Then products, in the same recommend-then-rank sequence.
    ranked_products = get_product_ranking(
        settings, user, get_product_recommendations(settings, user)
    )

    query_es(search, ranked_categories, ranked_products)