Skip to content

Instantly share code, notes, and snippets.

@insekticid
Created July 24, 2020 10:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save insekticid/452fc7e5619d3a2dff2d9c6d924c50b9 to your computer and use it in GitHub Desktop.
Save insekticid/452fc7e5619d3a2dff2d9c6d924c50b9 to your computer and use it in GitHub Desktop.
import sitemapModule from 'sitemap';
import axios from 'axios';
const { buildSitemapIndex, createSitemapsAndIndex } = sitemapModule;
// Axios instance pointed at the Elasticsearch HTTP endpoint configured through
// the ELASTIC_SERVER and ELASTIC_PORT environment variables.
const elasticsearch = axios.create({
  baseURL: `http://${process.env.ELASTIC_SERVER}:${process.env.ELASTIC_PORT}`,
  responseType: 'json',
});

// Pass-through interceptors: both hand their argument back unchanged. They are
// kept only as a convenient place to add a console.log while debugging.
elasticsearch.interceptors.request.use((outgoing) => outgoing);
elasticsearch.interceptors.response.use((incoming) => incoming);
/**
 * Build the `_search` request body for a terms aggregation over `field`.
 *
 * The body requests no hits (`size: 0`) and a single terms aggregation named
 * `group_by_state`; getData() reads the resulting buckets under that name.
 *
 * @param {string} field - Field to aggregate on, e.g. 'name.keyword'.
 * @param {number} [maxBuckets=500000] - Maximum number of buckets to request
 *   from the terms aggregation. Defaults to the previously hard-coded value.
 * @returns {object} Request body to POST to the `_search` endpoint.
 */
const elasticData = (field, maxBuckets = 500000) => ({
  _source: {
    includes: ['name', 'url'],
  },
  // No documents are needed, only the aggregation result.
  size: 0,
  aggs: {
    group_by_state: {
      terms: {
        field,
        size: maxBuckets,
      },
    },
  },
});
// Accumulates every sitemap entry (`{ url }`) gathered from the aggregations.
const sitemapPaths = [];

/**
 * Hand the collected entries to `createSitemapsAndIndex` together with the
 * fixed output options of this script.
 *
 * @param {Array<{url: string}>} urls - Sitemap entries to write.
 * @returns {*} Whatever `createSitemapsAndIndex` returns (awaited by getData).
 */
const createIndex = (urls) => {
  const options = {
    urls,
    lastmod: new Date().toISOString(),
    targetFolder: 'sitemap-search',
    hostname: process.env.SITEMAP_URL,
    cacheTime: 600,
    sitemapName: 'sitemap',
    // Number of urls to allow in each sitemap.
    sitemapSize: 10000,
    // Whether to gzip the files.
    gzip: true,
  };

  return createSitemapsAndIndex(options);
};
/**
 * Append one sitemap entry for an aggregation bucket to `sitemapPaths`.
 *
 * The entry URL is the SITEMAP_KEY_URL environment variable followed by the
 * bucket's key.
 *
 * @param {{key: *}} bucket - Aggregation bucket; only `key` is read.
 * @returns {void}
 */
const prepareUrl = (bucket) => {
  const url = process.env.SITEMAP_KEY_URL + bucket.key;
  sitemapPaths.push({ url });
};
/**
 * Run the terms aggregation for `field` and register one sitemap entry per
 * returned bucket.
 *
 * @param {string} field - Field passed to elasticData().
 * @param {string} label - Prefix for the progress line written to the console.
 * @returns {Promise<void>}
 */
const collectBuckets = async (field, label) => {
  const response = await elasticsearch.post(
    process.env.ELASTIC_INDEX + '/_search',
    elasticData(field),
  );
  const buckets = response.data.aggregations.group_by_state.buckets;

  // prepareUrl is synchronous and returns nothing, so a plain loop is enough;
  // wrapping it in Promise.all only awaited an array of `undefined`.
  for (const bucket of buckets) {
    prepareUrl(bucket);
  }

  console.log(label, buckets.length, sitemapPaths.length);
};

/**
 * Collect sitemap entries from the `name.keyword` and `name.analyzed`
 * aggregations (in that order) and write the sitemap files and index.
 *
 * @returns {Promise<void>} Rejects if a search request or the sitemap
 *   generation fails.
 */
const getData = async () => {
  // Sequential on purpose: keeps the entry order (keyword first, then analyzed).
  await collectBuckets('name.keyword', 'done keyword');
  await collectBuckets('name.analyzed', 'done analyzed');

  await createIndex(sitemapPaths);
  console.log('done all', sitemapPaths.length);
};

// Do not leave the promise floating: report the failure and make the process
// exit with a non-zero status so a cron job or CI step can detect it.
getData().catch((error) => {
  console.error('sitemap generation failed', error);
  process.exitCode = 1;
});
@insekticid
Copy link
Author

# FOSElasticaBundle configuration for the `app` index: the mapping of the
# `content` type followed by the analysis settings that mapping refers to.
fos_elastica:
  indexes:
    app:
      types:
        content:
          properties:
            # `name` is indexed with the `czech` analyzer and additionally as
            # the three sub-fields below (name.keyword, name.sitemap,
            # name.analyzed).
            name:
              type: "text"
              analyzer: "czech"
              fields:
                # Whole value as a single term, passed through the
                # `remove_quotes` normalizer defined under settings.
                keyword:
                  type: "keyword"
                  normalizer: remove_quotes
                # Text sub-field with fielddata enabled and no analyzer set here.
                sitemap:
                  type: "text"
                  fielddata: true
                # Text sub-field analyzed with `whitespacer_bigger_one`;
                # fielddata is enabled on it.
                analyzed:
                  type: "text"
                  analyzer: whitespacer_bigger_one
                  fielddata: true
            url: ~
            picture: ~
            pictureCaption:
      settings:
        index:
          analysis:
            char_filter:
              # Replaces every run of whitespace with a single "+".
              space_remover:
                type: pattern_replace
                pattern: "\\s+"
                replacement: "+"
              # Replaces each character outside a-z, A-Z, 0-9 and the range
              # À-ž with a space.
              special_chars_remover:
                type: pattern_replace
                pattern: "[^a-zA-Z0-9À-ž]"
                replacement: " "
              # Strips leading and trailing whitespace.
              trimmer:
                type: pattern_replace
                pattern: "^\\s+|\\s+$"
                replacement: ""
            normalizer:
              # Char filters are listed in the order special_chars_remover,
              # trimmer, space_remover, followed by the lowercase filter.
              remove_quotes:
                type: custom
                char_filter: [special_chars_remover, trimmer, space_remover]
                filter: [lowercase]
            analyzer:
              # NOTE(review): czech_hunspell is listed before lowercase here;
              # confirm this order is intended for the cs_CZ dictionary.
              czech:
                type: custom
                tokenizer: standard
                filter: [czech_hunspell, lowercase, icu_folding, unique_on_same_position]
              whitespacer_bigger_one:
                type: custom
                tokenizer: standard
                # NOTE(review): `normalizer` does not look like a custom
                # analyzer option (analyzers take `char_filter`); confirm
                # whether this key has any effect.
                normalizer: remove_quotes
                filter: [bigger_than_one, word_delimiter, czech_hunspell, lowercase]
            filter:
              # Hunspell token filter using the cs_CZ locale.
              czech_hunspell:
                type: hunspell
                locale: cs_CZ
              # `unique` filter restricted to tokens at the same position.
              unique_on_same_position:
                type: unique
                only_on_same_position: TRUE
              # `length` filter with a minimum of 2.
              bigger_than_one:
                type: length
                min: 2

@insekticid
Copy link
Author

insekticid commented Jul 24, 2020

POST content/_search
{
  "size": 0,
  "aggs": {
    "group_by_state": {
      "terms": {
        "field": "name.analyzed",
        "size": 1000
      }
    }
  }
}

{
  "took" : 548,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 104701,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "group_by_state" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "salát",
          "doc_count" : 5365
        },
        {
          "key" : "kuřecí",
          "doc_count" : 5178
        },
        {
          "key" : "polévka",
          "doc_count" : 4902
        },
        {
          "key" : "pečený",
          "doc_count" : 4086
        },
        {
          "key" : "koláč",
          "doc_count" : 3888
        },
        {
          "key" : "omáčka",
          "doc_count" : 3833
        },
        {
          "key" : "vepřový",
          "doc_count" : 3155
        },
        {
          "key" : "maso",
          "doc_count" : 3097
        },

GET /recipe,recipes/_search
{
  "_source": {
    "includes": [ "name", "url" ]
  },
  "size": 0,
  "aggs": {
    "group_by_state": {
      "terms": {
        "field": "name.keyword",
        "size": 3000
      }
    }
  }
}


{
  "took" : 3137,
  "timed_out" : false,
  "_shards" : {
    "total" : 10,
    "successful" : 10,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 125593,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "group_by_state" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "perníčky",
          "doc_count" : 23
        },
        {
          "key" : "vanilkové+rohlíčky",
          "doc_count" : 23
        },
        {
          "key" : "tiramisu",
          "doc_count" : 22
        },
        {
          "key" : "kuře+na+paprice",
          "doc_count" : 16
        },
        {
          "key" : "čokoládová+pěna",
          "doc_count" : 16
        },
        {
          "key" : "bramborové+knedlíky",
          "doc_count" : 15
        },
        {
          "key" : "polévka+z+červené+čočky",
          "doc_count" : 15
        },
        {
          "key" : "vánočka",
          "doc_count" : 15
        },
        {
          "key" : "brokolicová+polévka",
          "doc_count" : 14
        },


@insekticid
Copy link
Author

GET /recipe,recipes/_search
{
    "_source": {
    "includes": [ "name", "url" ]
  },
    "query": {
        "bool": {
            "should": [
                {
                    "match": {
                        "name": {
                            "query": "svarak",
                            "fuzziness": 1
                        }
                    }
                }
            ]
        }
    },
    "indices_boost": [
        {
            "recipe": 1.8
        }
    ],
    "from": 0,
    "size": 48
}
{
  "took" : 9,
  "timed_out" : false,
  "_shards" : {
    "total" : 10,
    "successful" : 10,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 20,
    "max_score" : 17.028666,
    "hits" : [
      {
        "_index" : "recipe",
        "_type" : "recipe",
        "_id" : "4278",
        "_score" : 17.028666,
        "_source" : {
          "name" : "Medový svařák",
          "url" : "https://www.recepty.eu/teple-napoje/medovy-svarak.html"
        }
      },
      {
        "_index" : "recipe",
        "_type" : "recipe",
        "_id" : "15094",
        "_score" : 14.939789,
        "_source" : {
          "name" : "Svařák s jablkem a hruškou",
          "url" : "https://www.recepty.eu/teple-napoje/svarak-s-jablkem-a-hruskou.html"
        }
      },

@insekticid
Copy link
Author

insekticid commented Jul 24, 2020

<?php

declare(strict_types=1);

/*
 * This file is part of Recepty.eu project
 * (c) Exploit.cz <insekticid@exploit.cz>
 * (c) Recepty.eu <info@recepty.eu>
 *
 * This source file is subject to the proprietary license.
 */

namespace App\Repository;

use App\Entity\Category;

use Elastica\Query;
use Elastica\Query\BoolQuery;
use Elastica\Query\Match;
use Elastica\Query\Terms;
use FOS\ElasticaBundle\Repository;

/**
 * Elasticsearch-backed repository that searches documents by their `name`
 * field, optionally restricted to one category.
 */
class SearchRepository extends Repository
{
    use PagingTrait;

    /**
     * Run a paginated fuzzy search for the given term.
     *
     * @param string        $searchTerm Text to look for in the `name` field
     * @param Category|null $category   When given, only documents of this category are returned
     * @param int           $page       Page to position the paginator on
     * @param int           $limit      Maximum number of items per page
     *
     * @return array|null ['items' => paginator, 'searchTerm' => string], or null for an empty term
     */
    public function search(string $searchTerm, ?Category $category = null, int $page = 1, int $limit = 48) : ?array
    {
        // Compare against the empty string explicitly: a plain truthiness
        // check also rejects the valid search term "0".
        if ('' === $searchTerm) {
            return null;
        }

        $query = Query::create($this->baseQuery($searchTerm, $category));
        // Score hits from the `recipe` index 1.8 times higher.
        $query->setParam('indices_boost', [['recipe' => 1.8]]);

        $items = $this->findPaginated($query);
        $items->setMaxPerPage($limit);
        $items->setCurrentPage($page);

        return ['items' => $items, 'searchTerm' => $searchTerm];
    }

    /**
     * Build the bool query shared by searches: a fuzzy match on `name`, plus a
     * mandatory category restriction when a category is given.
     *
     * @param string        $searchTerm Text to match against `name`
     * @param Category|null $category   Optional category restriction
     *
     * @return BoolQuery
     */
    protected function baseQuery(string $searchTerm, ?Category $category = null) : BoolQuery
    {
        $boolQuery = new BoolQuery();

        $fieldQuery = new Match();
        $fieldQuery->setFieldQuery('name', $searchTerm);
        $fieldQuery->setFieldFuzziness('name', 1);
        $boolQuery->addShould($fieldQuery);
        // As soon as a `must` clause is present, `should` clauses become
        // optional by default, so a category search would return every
        // document of the category. Require the name match explicitly.
        $boolQuery->setMinimumShouldMatch(1);

        if ($category) {
            $categoryQuery = new Terms();
            $categoryQuery->setTerms('category', [$category->getId()]);
            $boolQuery->addMust($categoryQuery);
        }

        return $boolQuery;
    }
}
<?php

declare(strict_types=1);

/*
 * This file is part of Recepty.eu project
 * (c) Exploit.cz <insekticid@exploit.cz>
 * (c) Recepty.eu <info@recepty.eu>
 *
 * This source file is subject to the proprietary license.
 */

namespace App\Repository;

use App\Util\LimitedPagerfanta;
use Doctrine\ORM\Query;
use Doctrine\ORM\QueryBuilder;
use Pagerfanta\Adapter\DoctrineORMAdapter;
use Pagerfanta\Pagerfanta;

/**
 * Shared helper for turning a Doctrine ORM query into a configured pager.
 */
trait PagingTrait
{
    /**
     * Wrap a Doctrine query in a pager positioned on the requested page.
     *
     * @param Query    $query             Query whose results are paginated
     * @param int      $limit             Maximum number of items per page
     * @param int      $page              Page to position the pager on
     * @param int|null $limitedPageNumber When truthy, a LimitedPagerfanta is used and given this value
     *
     * @return Pagerfanta
     */
    protected function createPaginator(Query $query, int $limit, int $page, ?int $limitedPageNumber = null) : Pagerfanta
    {
        $ormAdapter = new DoctrineORMAdapter($query);

        if (!$limitedPageNumber) {
            // No page cap requested: use the stock pager.
            $paginator = new Pagerfanta($ormAdapter);
        } else {
            $paginator = new LimitedPagerfanta($ormAdapter);
            $paginator->setLimitedPageNumber($limitedPageNumber);
        }

        // Page size is set before the current page, as in the original order.
        $paginator->setMaxPerPage($limit);
        $paginator->setCurrentPage($page);

        return $paginator;
    }
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment