# Checkbot
import sys                  # to allow inputs from the command line
import random               # to sample products and pages at random
import simplejson as json   # to read JSON data; simplejson is easier for nested data
import urllib.request       # to make HTTP requests
# What's the issue?
# 1. A lot of data quality issues are not solved or owned.
# 2. The current lists are not efficient because they are mostly unknown by the
#    community, and we have to scan them to see what remains to be done.
# The goal of this tool is to push quality errors to volunteers.
# It does not replace the current lists.
# This tool is inspired by https://www.codetriage.com/
# 1. It must push different issues to different people to ensure the work is well distributed.
# 1b. It could prioritize some issues. Or not.
# 2. It should not bother people with too many solicitations if they don't answer. It could allow people to take a break.
# 3. It should not bother people with too many missions at a time.
# 4. It shall let people see the ongoing effort of the community: number of participants, etc.
# 5. It must use as few resources as possible.
# Context
# Open Food Facts identifies data quality errors:
# https://world.openfoodfacts.org/data-quality-errors
# https://world.openfoodfacts.org/data-quality-errors&json=1
# For a product with an identified error, the API answers with a table of all issues, as follows:
# data_quality_errors_tags: [
#   "en:nutrition-value-over-105-sodium",
#   "en:nutrition-value-over-105-salt",
#   "en:nutrition-value-total-over-105"
# ]
# To extract these errors:
# $ zcat openfoodfacts-products.jsonl.gz | head -n 50000 | \
#   jq -r '. | select(.data_quality_errors_tags[]? != "") | [.code,.data_quality_errors_tags] | flatten | @csv' > errors.world.csv
#
# $ time zcat openfoodfacts-products.jsonl.gz | jq -r '. | select(.data_quality_errors_tags[]? != "") | [.code,.data_quality_errors_tags]' > errors.world.json
# real  8m12,954s
# user  9m42,042s
# sys   0m14,459s
# $ ll -h errors*
# -rw-rw-r-- 1 charles charles 2,0M juil. 21 18:11 errors.world.json
# ~20000 products with data quality errors: 80 days for 83 users with an average of 3 fixes per day each.
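The throughput estimate above can be reproduced with a one-line calculation; all numbers come from the comment, and the 3 fixes per user per day is the assumed average it states:

```python
import math

flagged_products = 20000  # products with at least one data quality error
users = 83                # volunteers (from the estimate above)
fixes_per_day = 3         # assumed average fixes per user per day

days_to_clear = math.ceil(flagged_products / (users * fixes_per_day))
print(days_to_clear)  # 81, i.e. roughly the "80 days" quoted above
```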
# How it works.
# 1. Every day, download all the data and store quality errors in a database (JSON file?)
#    * flag all products that have been fixed
#    * add all new products
# 2. For each user who has subscribed, send them X products randomly:
#    * iterate over users:
#      * gather how they want to be alerted and how many products they want to fix each time
#      * don't send anything if they are paused or have not fixed any past suggestion for X days
#      * else select and send them X products randomly:
#        * products that have not been sent to other users
#        * products that were sent a long time ago (xxx days?)
#        * products that are in the list generated last night
#      * save the user and the date of the send
# 3.
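The user-iteration step above can be sketched with in-memory records; all field names, the sample users, and the 30-day inactivity threshold are assumptions of this sketch (the real tool would load records from the database built in step 1):

```python
import random
from datetime import date, timedelta

# Hypothetical user and product records for the sketch.
users = [
    {"name": "alice", "batch_size": 5, "paused": False,
     "last_fix": date.today() - timedelta(days=2)},
    {"name": "bob", "batch_size": 3, "paused": True,
     "last_fix": date.today() - timedelta(days=40)},
]
pending = [{"code": str(3000000000000 + i), "sent_on": None} for i in range(50)]

INACTIVITY_DAYS = 30  # assumed threshold for "did not fix anything for X days"

def pick_batch(user, products):
    """Select a user's random batch, applying the pause/inactivity rules."""
    inactive = (date.today() - user["last_fix"]).days > INACTIVITY_DAYS
    if user["paused"] or inactive:
        return []
    # Only offer products never sent before (resending stale sends is
    # left out of this sketch).
    fresh = [p for p in products if p["sent_on"] is None]
    batch = random.sample(fresh, min(user["batch_size"], len(fresh)))
    for p in batch:
        p["sent_on"] = date.today()  # remember the send, as in step 2
    return batch

for u in users:
    print(u["name"], [p["code"] for p in pick_batch(u, pending)])
```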
# Alert example:
# Email subject (if an email): Open Food Facts quality community alert
# Hi username, here are 5 issues you might want to fix:
# 1. ...
# 2. ...
#
# Statistics:
# * 37 users are registered to the OFF-quality community (including you!)
# * 19,559 products have a quality error, representing 1.065% of all products.
# * xx products have been fixed since yesterday
# * if every user fixes their own suggested list, it will take 87 days to reach 0 data quality errors
#
# I want more products to fix now! => it's here.
#
# Change your options:
# * change the number of products or the alert frequency
# * pause for some time
# * unsubscribe
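A minimal sketch of assembling the alert body above; the helper name, the stats fields, the placeholder barcodes, the product-URL pattern, and the total product count (back-derived from the 1.065% figure) are all assumptions of this sketch:

```python
def format_alert(username, codes, stats):
    """Build the plain-text alert body shown in the comments above."""
    lines = [f"Hi {username}, here are {len(codes)} issues you might want to fix:"]
    lines += [f"{i}. https://world.openfoodfacts.org/product/{code}"
              for i, code in enumerate(codes, 1)]
    lines += [
        "",
        "Statistics:",
        f"* {stats['users']} users are registered to the OFF-quality community (including you!)",
        f"* {stats['flagged']:,} products have a quality error, "
        f"representing {stats['flagged'] / stats['total']:.3%} of all products.",
    ]
    return "\n".join(lines)

# Placeholder barcodes; "total" is back-derived from the 1.065% figure above.
print(format_alert("charles", ["3000000000001", "3000000000002"],
                   {"users": 37, "flagged": 19559, "total": 1836525}))
```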
#%%
print("Hello world")
#%%
# 1. read inputs from command line or somewhere else
#%%
# 2. initiate default values: number of issues
# Read the number of issues from the command line if given, else default to 5.
max_issues = int(sys.argv[1]) if len(sys.argv) > 1 else 5
#%%
# 3. read json file, choose random page(s) and random products
# * Read json
# https://world.openfoodfacts.org/data-quality-error/nutrition-value-over-3800-energy&json=1
#contents = urllib.request.urlopen("http://example.com/foo/bar").read()
# Parse json
jsonstring = """
{
    "count": 5851,
    "page": 1,
    "page_count": 24,
    "page_size": 24,
    "products": [
        { }
    ]
}
"""
json_dict = json.loads(jsonstring)
print("JSON string = ", json_dict)
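The hard-coded jsonstring stands in for a real request. A sketch of the fetch step, using the quality-error URL from the comments above (the helper name, the timeout, and the None fallback are assumptions of this sketch):

```python
import json
import urllib.error
import urllib.request

def fetch_issue_page(url, timeout=10):
    """Download and parse one JSON page of products for a quality issue.

    Returns None on network or parse errors, so the script can fall back
    to the hard-coded sample while developing offline.
    """
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except (urllib.error.URLError, ValueError):
        return None

# Example (needs network access):
# page = fetch_issue_page("https://world.openfoodfacts.org/"
#                         "data-quality-error/nutrition-value-over-3800-energy&json=1")
```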
# * select X products randomly
import random  # for random.sample below

def get_products(json_dict, nb=5):
    """Get a list of products related to a particular issue.

    Parameters
    ----------
    json_dict : dict
        The JSON object containing all products related to a particular issue.
    nb : int
        Number of products to return.

    Returns
    -------
    list
        A list of barcodes.
    """
    # Each entry of the "products" array is a dict; the barcode is its "code" field.
    codes = [p.get("code") for p in json_dict["products"]]
    # Simple case: the number of products requested covers everything counted.
    if json_dict["count"] <= nb:
        return codes
    # Otherwise pick nb products at random.
    # e.g. nb = 5,  count = 20, page_size = 24, page_count = 1  => pages = [1]
    # e.g. nb = 26, count = 32, page_size = 24, page_count = 2  => pages = [1,2]
    # e.g. nb = 10, count = 50, page_size = 5,  page_count = 10 => pages = [3,7]
    # Fetching random pages as in the examples above is still to be done;
    # for now, sample from the products of the page we already have.
    print('Number of pages = ', json_dict["page_count"])
    return random.sample(codes, min(nb, len(codes)))
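The page-selection examples in the comments can be turned into a small self-contained helper; the function name is an assumption, and it only chooses page numbers, leaving the fetching to the caller:

```python
import math
import random

def pick_random_pages(count, page_size, nb):
    """Choose enough random page numbers (1-based) to cover nb products.

    Mirrors the worked examples above: pick ceil(nb / page_size) pages
    out of ceil(count / page_size) available pages.
    """
    page_count = math.ceil(count / page_size)
    pages_needed = min(page_count, math.ceil(nb / page_size))
    return sorted(random.sample(range(1, page_count + 1), pages_needed))

print(pick_random_pages(20, 24, 5))       # [1]
print(len(pick_random_pages(50, 5, 10)))  # 2
```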
#%%
# 4. send data
# 5. save data to be checked