@Geobm
Created August 30, 2020 12:39
Match.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Match.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyMCYTAOVk31bhWi+bv3s7HC",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Geobm/a9a6af793b68354c419c454f36be25f5/match.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "CKcxn_xGHlkv",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"outputId": "700ccaf8-101a-424d-b1c7-90e6cafa11ad"
},
"source": [
"import nltk\n",
"nltk.download('all')\n",
"\n",
"\n",
"\n",
"\"\"\"\n",
"Function to reuse later in other file of the app\n",
"\n",
"X = str(repo_name)\n",
"Y = str(repo_description)\n",
" \n",
"# tokenization \n",
"X_list = word_tokenize(X) \n",
"Y_list = word_tokenize(Y) \n",
" \n",
"# sw contains the list of stopwords \n",
"sw = stopwords.words('english') \n",
"l1 =[];l2 =[] \n",
"\n",
"# remove stop words from the string \n",
"X_set = {w for w in X_list if not w in sw} \n",
"Y_set = {w for w in Y_list if not w in sw} \n",
"\n",
"# form a set containing keywords of both strings \n",
"rvector = X_set.union(Y_set) \n",
"for w in rvector: \n",
" if w in X_set: l1.append(1) # create a vector \n",
" else: l1.append(0) \n",
" if w in Y_set: l2.append(1) \n",
" else: l2.append(0) \n",
"c = 0\n",
"\n",
"# cosine formula \n",
"for i in range(len(rvector)): \n",
" c+= l1[i]*l2[i] \n",
"cosine = c / float((sum(l1)*sum(l2))**0.5) \n",
"print(\"similarity: \", cosine)\"\"\""
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading collection 'all'\n",
"[nltk_data] | \n",
"[nltk_data] | Downloading package abc to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/abc.zip.\n",
"[nltk_data] | Downloading package alpino to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/alpino.zip.\n",
"[nltk_data] | Downloading package biocreative_ppi to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/biocreative_ppi.zip.\n",
"[nltk_data] | Downloading package brown to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/brown.zip.\n",
"[nltk_data] | Downloading package brown_tei to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/brown_tei.zip.\n",
"[nltk_data] | Downloading package cess_cat to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/cess_cat.zip.\n",
"[nltk_data] | Downloading package cess_esp to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/cess_esp.zip.\n",
"[nltk_data] | Downloading package chat80 to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/chat80.zip.\n",
"[nltk_data] | Downloading package city_database to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/city_database.zip.\n",
"[nltk_data] | Downloading package cmudict to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/cmudict.zip.\n",
"[nltk_data] | Downloading package comparative_sentences to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/comparative_sentences.zip.\n",
"[nltk_data] | Downloading package comtrans to /root/nltk_data...\n",
"[nltk_data] | Downloading package conll2000 to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/conll2000.zip.\n",
"[nltk_data] | Downloading package conll2002 to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/conll2002.zip.\n",
"[nltk_data] | Downloading package conll2007 to /root/nltk_data...\n",
"[nltk_data] | Downloading package crubadan to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/crubadan.zip.\n",
"[nltk_data] | Downloading package dependency_treebank to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/dependency_treebank.zip.\n",
"[nltk_data] | Downloading package dolch to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/dolch.zip.\n",
"[nltk_data] | Downloading package europarl_raw to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/europarl_raw.zip.\n",
"[nltk_data] | Downloading package floresta to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/floresta.zip.\n",
"[nltk_data] | Downloading package framenet_v15 to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/framenet_v15.zip.\n",
"[nltk_data] | Downloading package framenet_v17 to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/framenet_v17.zip.\n",
"[nltk_data] | Downloading package gazetteers to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/gazetteers.zip.\n",
"[nltk_data] | Downloading package genesis to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/genesis.zip.\n",
"[nltk_data] | Downloading package gutenberg to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/gutenberg.zip.\n",
"[nltk_data] | Downloading package ieer to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/ieer.zip.\n",
"[nltk_data] | Downloading package inaugural to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/inaugural.zip.\n",
"[nltk_data] | Downloading package indian to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/indian.zip.\n",
"[nltk_data] | Downloading package jeita to /root/nltk_data...\n",
"[nltk_data] | Downloading package kimmo to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/kimmo.zip.\n",
"[nltk_data] | Downloading package knbc to /root/nltk_data...\n",
"[nltk_data] | Downloading package lin_thesaurus to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/lin_thesaurus.zip.\n",
"[nltk_data] | Downloading package mac_morpho to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/mac_morpho.zip.\n",
"[nltk_data] | Downloading package machado to /root/nltk_data...\n",
"[nltk_data] | Downloading package masc_tagged to /root/nltk_data...\n",
"[nltk_data] | Downloading package moses_sample to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping models/moses_sample.zip.\n",
"[nltk_data] | Downloading package movie_reviews to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/movie_reviews.zip.\n",
"[nltk_data] | Downloading package names to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/names.zip.\n",
"[nltk_data] | Downloading package nombank.1.0 to /root/nltk_data...\n",
"[nltk_data] | Downloading package nps_chat to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/nps_chat.zip.\n",
"[nltk_data] | Downloading package omw to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/omw.zip.\n",
"[nltk_data] | Downloading package opinion_lexicon to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/opinion_lexicon.zip.\n",
"[nltk_data] | Downloading package paradigms to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/paradigms.zip.\n",
"[nltk_data] | Downloading package pil to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/pil.zip.\n",
"[nltk_data] | Downloading package pl196x to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/pl196x.zip.\n",
"[nltk_data] | Downloading package ppattach to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/ppattach.zip.\n",
"[nltk_data] | Downloading package problem_reports to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/problem_reports.zip.\n",
"[nltk_data] | Downloading package propbank to /root/nltk_data...\n",
"[nltk_data] | Downloading package ptb to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/ptb.zip.\n",
"[nltk_data] | Downloading package product_reviews_1 to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/product_reviews_1.zip.\n",
"[nltk_data] | Downloading package product_reviews_2 to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/product_reviews_2.zip.\n",
"[nltk_data] | Downloading package pros_cons to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/pros_cons.zip.\n",
"[nltk_data] | Downloading package qc to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/qc.zip.\n",
"[nltk_data] | Downloading package reuters to /root/nltk_data...\n",
"[nltk_data] | Downloading package rte to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/rte.zip.\n",
"[nltk_data] | Downloading package semcor to /root/nltk_data...\n",
"[nltk_data] | Downloading package senseval to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/senseval.zip.\n",
"[nltk_data] | Downloading package sentiwordnet to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/sentiwordnet.zip.\n",
"[nltk_data] | Downloading package sentence_polarity to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/sentence_polarity.zip.\n",
"[nltk_data] | Downloading package shakespeare to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/shakespeare.zip.\n",
"[nltk_data] | Downloading package sinica_treebank to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/sinica_treebank.zip.\n",
"[nltk_data] | Downloading package smultron to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/smultron.zip.\n",
"[nltk_data] | Downloading package state_union to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/state_union.zip.\n",
"[nltk_data] | Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/stopwords.zip.\n",
"[nltk_data] | Downloading package subjectivity to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/subjectivity.zip.\n",
"[nltk_data] | Downloading package swadesh to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/swadesh.zip.\n",
"[nltk_data] | Downloading package switchboard to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/switchboard.zip.\n",
"[nltk_data] | Downloading package timit to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/timit.zip.\n",
"[nltk_data] | Downloading package toolbox to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/toolbox.zip.\n",
"[nltk_data] | Downloading package treebank to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/treebank.zip.\n",
"[nltk_data] | Downloading package twitter_samples to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/twitter_samples.zip.\n",
"[nltk_data] | Downloading package udhr to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/udhr.zip.\n",
"[nltk_data] | Downloading package udhr2 to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/udhr2.zip.\n",
"[nltk_data] | Downloading package unicode_samples to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/unicode_samples.zip.\n",
"[nltk_data] | Downloading package universal_treebanks_v20 to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Downloading package verbnet to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/verbnet.zip.\n",
"[nltk_data] | Downloading package verbnet3 to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/verbnet3.zip.\n",
"[nltk_data] | Downloading package webtext to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/webtext.zip.\n",
"[nltk_data] | Downloading package wordnet to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/wordnet.zip.\n",
"[nltk_data] | Downloading package wordnet_ic to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/wordnet_ic.zip.\n",
"[nltk_data] | Downloading package words to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/words.zip.\n",
"[nltk_data] | Downloading package ycoe to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/ycoe.zip.\n",
"[nltk_data] | Downloading package rslp to /root/nltk_data...\n",
"[nltk_data] | Unzipping stemmers/rslp.zip.\n",
"[nltk_data] | Downloading package maxent_treebank_pos_tagger to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping taggers/maxent_treebank_pos_tagger.zip.\n",
"[nltk_data] | Downloading package universal_tagset to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping taggers/universal_tagset.zip.\n",
"[nltk_data] | Downloading package maxent_ne_chunker to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping chunkers/maxent_ne_chunker.zip.\n",
"[nltk_data] | Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] | Unzipping tokenizers/punkt.zip.\n",
"[nltk_data] | Downloading package book_grammars to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping grammars/book_grammars.zip.\n",
"[nltk_data] | Downloading package sample_grammars to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping grammars/sample_grammars.zip.\n",
"[nltk_data] | Downloading package spanish_grammars to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping grammars/spanish_grammars.zip.\n",
"[nltk_data] | Downloading package basque_grammars to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping grammars/basque_grammars.zip.\n",
"[nltk_data] | Downloading package large_grammars to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping grammars/large_grammars.zip.\n",
"[nltk_data] | Downloading package tagsets to /root/nltk_data...\n",
"[nltk_data] | Unzipping help/tagsets.zip.\n",
"[nltk_data] | Downloading package snowball_data to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Downloading package bllip_wsj_no_aux to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping models/bllip_wsj_no_aux.zip.\n",
"[nltk_data] | Downloading package word2vec_sample to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping models/word2vec_sample.zip.\n",
"[nltk_data] | Downloading package panlex_swadesh to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Downloading package mte_teip5 to /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/mte_teip5.zip.\n",
"[nltk_data] | Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping taggers/averaged_perceptron_tagger.zip.\n",
"[nltk_data] | Downloading package averaged_perceptron_tagger_ru to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping\n",
"[nltk_data] | taggers/averaged_perceptron_tagger_ru.zip.\n",
"[nltk_data] | Downloading package perluniprops to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping misc/perluniprops.zip.\n",
"[nltk_data] | Downloading package nonbreaking_prefixes to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Unzipping corpora/nonbreaking_prefixes.zip.\n",
"[nltk_data] | Downloading package vader_lexicon to\n",
"[nltk_data] | /root/nltk_data...\n",
"[nltk_data] | Downloading package porter_test to /root/nltk_data...\n",
"[nltk_data] | Unzipping stemmers/porter_test.zip.\n",
"[nltk_data] | Downloading package wmt15_eval to /root/nltk_data...\n",
"[nltk_data] | Unzipping models/wmt15_eval.zip.\n",
"[nltk_data] | Downloading package mwa_ppdb to /root/nltk_data...\n",
"[nltk_data] | Unzipping misc/mwa_ppdb.zip.\n",
"[nltk_data] | \n",
"[nltk_data] Done downloading collection all\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"'\\nX = str(repo_name)\\nY = str(repo_description)\\n \\n# tokenization \\nX_list = word_tokenize(X) \\nY_list = word_tokenize(Y) \\n \\n# sw contains the list of stopwords \\nsw = stopwords.words(\\'english\\') \\nl1 =[];l2 =[] \\n\\n# remove stop words from the string \\nX_set = {w for w in X_list if not w in sw} \\nY_set = {w for w in Y_list if not w in sw} \\n\\n# form a set containing keywords of both strings \\nrvector = X_set.union(Y_set) \\nfor w in rvector: \\n if w in X_set: l1.append(1) # create a vector \\n else: l1.append(0) \\n if w in Y_set: l2.append(1) \\n else: l2.append(0) \\nc = 0\\n\\n# cosine formula \\nfor i in range(len(rvector)): \\n c+= l1[i]*l2[i] \\ncosine = c / float((sum(l1)*sum(l2))**0.5) \\nprint(\"similarity: \", cosine)'"
]
},
"metadata": {
"tags": []
},
"execution_count": 1
}
]
},
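{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell above keeps the similarity code inside a string so it can be pasted into another file later. The cell below is a minimal runnable sketch of the same idea: the helper name `cosine_match` and the two sample strings are illustrative (not part of the original app), and a guard is added for the case where either string has no keywords left after stopword removal."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"\n",
"nltk.download('punkt', quiet=True)\n",
"nltk.download('stopwords', quiet=True)\n",
"\n",
"def cosine_match(repo_name, repo_description):\n",
"    # cosine_match is an illustrative name; computes cosine similarity\n",
"    # over binary (0/1) keyword-presence vectors\n",
"    sw = set(stopwords.words('english'))\n",
"    X_set = {w for w in word_tokenize(str(repo_name)) if w not in sw}\n",
"    Y_set = {w for w in word_tokenize(str(repo_description)) if w not in sw}\n",
"    rvector = X_set.union(Y_set)\n",
"    l1 = [1 if w in X_set else 0 for w in rvector]\n",
"    l2 = [1 if w in Y_set else 0 for w in rvector]\n",
"    dot = sum(a * b for a, b in zip(l1, l2))\n",
"    norm = (sum(l1) * sum(l2)) ** 0.5\n",
"    return dot / norm if norm else 0.0  # avoid division by zero on empty sets\n",
"\n",
"print('similarity:', cosine_match('bluetooth low energy attacks',\n",
"                                  'attacks on bluetooth low energy devices'))"
],
"execution_count": null,
"outputs": []
},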
{
"cell_type": "code",
"metadata": {
"id": "zClyUdB3HPqx",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "77d8d5ef-aabd-45eb-ea06-f6bbc505e2db"
},
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import re\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem.porter import PorterStemmer\n",
"import difflib\n",
"corpus = []\n",
"\n",
"text = [\"Garbelini/sweyntooth_bluetooth_low_energy_attacks\",\"In Brazil they drive on the right-hand side of the road. Brazil has a large coastline on the eastern side of South America\"]\n",
"x= []\n",
"for i in range(0,2):\n",
" review = re.sub('[^a-zA-Z]', ' ' ,text[i])\n",
" review = review.lower()\n",
" review = review.split()\n",
" ps = PorterStemmer()\n",
" review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]\n",
" review = ' '.join(review)\n",
" corpus.append(review)\n",
" x.append(corpus[i].split())\n",
"\n",
"print(x[1])\n",
"sm=difflib.SequenceMatcher(None,x[0],x[1])\n",
"sm.ratio()\n"
],
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": [
"['brazil', 'drive', 'right', 'hand', 'side', 'road', 'brazil', 'larg', 'coastlin', 'eastern', 'side', 'south', 'america']\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.0"
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
}
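,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`SequenceMatcher.ratio()` returns 0.0 above because the two stemmed token lists share no elements and `ratio()` scores ordered matching blocks (2*M/T). For keyword overlap between a repository name and a description, an order-insensitive measure may be a better fit; the sketch below uses illustrative token lists and a hypothetical `jaccard` helper."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"import difflib\n",
"\n",
"# disjoint token lists -> SequenceMatcher.ratio() == 0.0\n",
"a = ['sweyntooth', 'bluetooth', 'low', 'energi', 'attack']\n",
"b = ['brazil', 'drive', 'right', 'hand', 'side', 'road']\n",
"print(difflib.SequenceMatcher(None, a, b).ratio())\n",
"\n",
"def jaccard(tokens_a, tokens_b):\n",
"    # hypothetical order-insensitive alternative for keyword overlap\n",
"    sa, sb = set(tokens_a), set(tokens_b)\n",
"    return len(sa & sb) / len(sa | sb) if sa | sb else 0.0\n",
"\n",
"print(jaccard(a, a + ['extra']))  # 5 shared of 6 distinct -> ~0.83"
],
"execution_count": null,
"outputs": []
}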
]
}