Déborah Mesquita dmesquita

## experiment.py
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import RandomizedSearchCV

from classify_documents_model.pipeline import text_clf

categories = ['talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc']

twenty_train = fetch_20newsgroups(subset='train',

## pipeline.py
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from .transformers import TextTransformer

text_clf = Pipeline([
    ('tokens', TextTransformer("en_core_web_sm")),
    ('tfidf', TfidfVectorizer(tokenizer=lambda x:x, lowercase=False)),
    ('clf', RandomForestClassifier(n_estimators=100)),

## transformers.py
import spacy

from sklearn.base import BaseEstimator, TransformerMixin

class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp_model="en", lemmatization=True, remove_stopwords=True) -> None:
        """Record the constructor arguments as same-named instance attributes.

        No validation or loading happens here; the values are stored as given.

        Args:
            nlp_model: Stored on ``self.nlp_model``. Presumably the name of a
                spaCy model to load later (the module imports ``spacy``) --
                confirm against the rest of the class.
            lemmatization: Stored on ``self.lemmatization``.
            remove_stopwords: Stored on ``self.remove_stopwords``.
        """
        # Plain attribute storage only: each parameter is kept under its own
        # name, unchanged.
        self.remove_stopwords = remove_stopwords
        self.lemmatization = lemmatization
        self.nlp_model = nlp_model

## visualização_processos.json
{
  "$schema": "https://vega.github.io/schema/vega/v5.json",
  "width": 1200,
  "height": 500,
  "padding": 0,
  "autosize": {"type":"pad", "contains":"padding"},
  "config": {
    "text": {
      "font": "Ideal Sans, Avenir Next, Helvetica"
    },

## data_movimentacoes.csv

          
            movimentacao
            data
            numero
            pagina

            
              DATA_PETICAO_INICIAL
              19/09/2018
              0047317-30.2018.8.17.2001
              1

            
              DATA_PETICAO_INICIAL
              19/09/2018
              0047317-30.2018.8.17.2001
              1

            
              DATA_DECISAO
              01/10/2018
              0047317-30.2018.8.17.2001
              1

            
              DATA_CERTIDAO
              05/10/2018
              0047317-30.2018.8.17.2001
              1

            
              DATA_CERTIDAO
              08/10/2018
              0047317-30.2018.8.17.2001
              1

            
              DATA_PETICAO_INICIAL
              21/11/2016
              0000007-44.2016.8.17.2180
              1

            
              DATA_CERTIDAO
              21/11/2016
              0000007-44.2016.8.17.2180
              1

            
              DATA_CERTIDAO
              21/11/2016
              0000007-44.2016.8.17.2180
              1

            
              DATA_DESPACHO
              29/11/2016
              0000007-44.2016.8.17.2180
              1

## data.csv
promotor,assunto,vara,classe,justica_gratuita,segredo_justica,valor,numero,total_paginas,data,dificuldade_movimentacoes,dificuldade_calculada
7º Promotor de Justiça Cível da Capital,Oferta,Juizado Informal de Família,ALIMENTOS - LEI ESPECIAL Nº 5.478/68,True,True,-1.0,0047317-30.2018.8.17.2001,21,08/10/2018,17,2.9014215940827497
Promotor de Justiça de Altinho,Retificação de Nome,Vara Única da Comarca de Altinho,RETIFICAÇÃO OU SUPRIMENTO OU RESTAURAÇÃO DE REGISTRO CIVIL,True,True,-1.0,0000007-44.2016.8.17.2180,22,09/08/2017,20,3.0252910757955354
2º Promotor de Justiça de Gravatá,Guarda,2ª Vara Cível da Comarca de Gravatá,GUARDA,True,True,-1.0,0001103-11.2018.8.17.2670,21,11/09/2018,7,2.4159137783010487
7ª Procuradoria de Justiça Cível,"Indenização por Dano Moral, Indenização por Dano Material",1º Gabinete da Turma Estadual de Uniformização,RECLAMAÇÃO,False,False,-1.0,0000155-19.2017.8.17.9003,184,03/09/2018,18,4.216562194946349
5º Promotor de Justiça Cível de Olinda,Regulamentação de Visitas,3ª Vara de Família

## tree.json
[
  {
    "id": 1,
    "name": "Ensino e Aprendizagem de Introdução à Programação"
  },
  {
    "id": 2,
    "name": "Dificuldades dos estudantes",
    "parent": 1
  },

## data.json
{"nodes": [{"group": "humanas", "index": 0, "name": "ADMINISTRAÇÃO"},
 {"group": "humanas", "index": 1, "name": "ANTROPOLOGIA"},
 {"group": "humanas", "index": 2, "name": "ARQUEOLOGIA"},
 {"group": "biologicas", "index": 3, "name": "BIOLOGIA ANIMAL"},
 {"group": "saude", "index": 4, "name": "BIOLOGIA APLICADA À SAÚDE"},
 {"group": "biologicas", "index": 5, "name": "BIOLOGIA VEGETAL"},
 {"group": "biologicas", "index": 6, "name": "BIOQUÍMICA E FISIOLOGIA"},
 {"group": "exatas", "index": 7, "name": "BIOTECNOLOGIA INDUSTRIAL"},
 {"group": "saude", "index": 8, "name": "CIRURGIA"},
 {"group": "exatas", "index": 9, "name": "CIÊNCIA DA COMPUTAÇÃO"},

## data_ufpe.json
{"nodes": [{"group": "humanas", "index": 0, "name": "ADMINISTRAÇÃO"},
 {"group": "humanas", "index": 1, "name": "ANTROPOLOGIA"},
 {"group": "humanas", "index": 2, "name": "ARQUEOLOGIA"},
 {"group": "biologicas", "index": 3, "name": "BIOLOGIA ANIMAL"},
 {"group": "saude", "index": 4, "name": "BIOLOGIA APLICADA À SAÚDE"},
 {"group": "biologicas", "index": 5, "name": "BIOLOGIA VEGETAL"},
 {"group": "biologicas", "index": 6, "name": "BIOQUÍMICA E FISIOLOGIA"},
 {"group": "exatas", "index": 7, "name": "BIOTECNOLOGIA INDUSTRIAL"},
 {"group": "saude", "index": 8, "name": "CIRURGIA"},
 {"group": "exatas", "index": 9, "name": "CIÊNCIA DA COMPUTAÇÃO"},

## spider.py
class MySpider(scrapy.Spider):
    name = 'myspider'

    # All dissertations by issued date
    start_urls = ['http://www.repositorio.ufpe.br/handle/123456789/50/browse?type=dateissued']

    def parse(self, response):
        """Yield a follow-up request for every dissertation link on the page.

        Links are read from the ``href`` attribute of the anchors matched by
        ``.artifact-title > a``. Each generated request is handled by
        ``self.parse_dissertation``.

        Args:
            response: The response for one of ``start_urls``.

        Yields:
            The request objects returned by ``response.follow``.
        """
        for href in response.css('.artifact-title > a::attr(href)'):
            # response.follow resolves the href against response.url, so the
            # host is not hard-coded here. Relative links keep working, and
            # absolute links are no longer corrupted by prefixing the host.
            yield response.follow(href, callback=self.parse_dissertation)
	from sklearn.datasets import fetch_20newsgroups
	from sklearn.model_selection import RandomizedSearchCV

	from classify_documents_model.pipeline import text_clf

	categories = ['talk.politics.guns',
	'talk.politics.mideast',
	'talk.politics.misc']

	twenty_train = fetch_20newsgroups(subset='train',
	from sklearn.pipeline import Pipeline
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.ensemble import RandomForestClassifier

	from .transformers import TextTransformer

	text_clf = Pipeline([
	('tokens', TextTransformer("en_core_web_sm")),
	('tfidf', TfidfVectorizer(tokenizer=lambda x:x, lowercase=False)),
	('clf', RandomForestClassifier(n_estimators=100)),
	import spacy

	from sklearn.base import BaseEstimator, TransformerMixin

	class TextTransformer(BaseEstimator, TransformerMixin):
	def __init__(self, nlp_model="en", lemmatization=True, remove_stopwords=True) -> None:
	self.nlp_model = nlp_model
	self.lemmatization = lemmatization
	self.remove_stopwords = remove_stopwords
	{
	"$schema": "https://vega.github.io/schema/vega/v5.json",
	"width": 1200,
	"height": 500,
	"padding": 0,
	"autosize": {"type":"pad", "contains":"padding"},
	"config": {
	"text": {
	"font": "Ideal Sans, Avenir Next, Helvetica"
	},
movimentacao	data	numero	pagina
DATA_PETICAO_INICIAL	19/09/2018	0047317-30.2018.8.17.2001	1
DATA_PETICAO_INICIAL	19/09/2018	0047317-30.2018.8.17.2001	1
DATA_DECISAO	01/10/2018	0047317-30.2018.8.17.2001	1
DATA_CERTIDAO	05/10/2018	0047317-30.2018.8.17.2001	1
DATA_CERTIDAO	08/10/2018	0047317-30.2018.8.17.2001	1
DATA_PETICAO_INICIAL	21/11/2016	0000007-44.2016.8.17.2180	1
DATA_CERTIDAO	21/11/2016	0000007-44.2016.8.17.2180	1
DATA_CERTIDAO	21/11/2016	0000007-44.2016.8.17.2180	1
DATA_DESPACHO	29/11/2016	0000007-44.2016.8.17.2180	1
	promotor,assunto,vara,classe,justica_gratuita,segredo_justica,valor,numero,total_paginas,data,dificuldade_movimentacoes,dificuldade_calculada
	7º Promotor de Justiça Cível da Capital,Oferta,Juizado Informal de Família,ALIMENTOS - LEI ESPECIAL Nº 5.478/68,True,True,-1.0,0047317-30.2018.8.17.2001,21,08/10/2018,17,2.9014215940827497
	Promotor de Justiça de Altinho,Retificação de Nome,Vara Única da Comarca de Altinho,RETIFICAÇÃO OU SUPRIMENTO OU RESTAURAÇÃO DE REGISTRO CIVIL,True,True,-1.0,0000007-44.2016.8.17.2180,22,09/08/2017,20,3.0252910757955354
	2º Promotor de Justiça de Gravatá,Guarda,2ª Vara Cível da Comarca de Gravatá,GUARDA,True,True,-1.0,0001103-11.2018.8.17.2670,21,11/09/2018,7,2.4159137783010487
	7ª Procuradoria de Justiça Cível,"Indenização por Dano Moral, Indenização por Dano Material",1º Gabinete da Turma Estadual de Uniformização,RECLAMAÇÃO,False,False,-1.0,0000155-19.2017.8.17.9003,184,03/09/2018,18,4.216562194946349
	5º Promotor de Justiça Cível de Olinda,Regulamentação de Visitas,3ª Vara de Família
	[
	{
	"id": 1,
	"name": "Ensino e Aprendizagem de Introdução à Programação"
	},
	{
	"id": 2,
	"name": "Dificuldades dos estudantes",
	"parent": 1
	},
	{"nodes": [{"group": "humanas", "index": 0, "name": "ADMINISTRAÇÃO"},
	{"group": "humanas", "index": 1, "name": "ANTROPOLOGIA"},
	{"group": "humanas", "index": 2, "name": "ARQUEOLOGIA"},
	{"group": "biologicas", "index": 3, "name": "BIOLOGIA ANIMAL"},
	{"group": "saude", "index": 4, "name": "BIOLOGIA APLICADA À SAÚDE"},
	{"group": "biologicas", "index": 5, "name": "BIOLOGIA VEGETAL"},
	{"group": "biologicas", "index": 6, "name": "BIOQUÍMICA E FISIOLOGIA"},
	{"group": "exatas", "index": 7, "name": "BIOTECNOLOGIA INDUSTRIAL"},
	{"group": "saude", "index": 8, "name": "CIRURGIA"},
	{"group": "exatas", "index": 9, "name": "CIÊNCIA DA COMPUTAÇÃO"},
	class MySpider(scrapy.Spider):
	name = 'myspider'

	# All dissertations by issued date
	start_urls = ['http://www.repositorio.ufpe.br/handle/123456789/50/browse?type=dateissued']

	def parse(self, response):
	# follow links to dissertation pages
	for href in response.css('.artifact-title > a::attr(href)'):
	yield response.follow('http://www.repositorio.ufpe.br'+href.extract(), self.parse_dissertation)