veryeli/basic_stats_worked_example.sql

## basic_stats_worked_example.sql
SELECT top_alternative AS alternative,
       ROUND(NTH(500, QUANTILES(ud_points, 1001)), 2) AS median_points,
       ROUND(avg(ud_points),2) AS mean_points,
       ROUND(NTH(500, QUANTILES(ud_weekly_problem_days, 1001)),2) AS median_weekly_days,
       ROUND(avg(ud_weekly_problem_days),2) AS mean_weekly_days,
       ROUND(NTH(500, QUANTILES(ud_monthly_problem_seconds/60, 1001)),2) AS median_monthly_minutes,
       ROUND(avg(ud_monthly_problem_seconds/60),2) AS mean_monthly_minutes,
       ROUND(NTH(500, QUANTILES(ud_monthly_problems_done, 1001)),2) AS median_monthly_problems,
       ROUND(avg(ud_monthly_problems_done),2) AS mean_monthly_problems,
       ROUND(NTH(500, QUANTILES(percent_mastered, 1001)*100),2) AS median_percent_mastered,
       ROUND(avg(percent_mastered) * 100,2) AS mean_percent_mastered,
       COUNT(1) AS N
FROM [eli.worked_example_users]
GROUP BY alternative;

## curated_examples
SELECT * FROM eli.worked_example_problem_logs where exercise in(
    'multi-step-word-problems-with-whole-numbers',
    'comparing-with-multiplication',
    'percentage_word_problems_1',
    'recog-features-func-2',
    'constructing-proportions-to-solve-application-problems',
    'interpret-features-func-2'
    )

## getting_most_people_to_practiced
SELECT SUM(became_mastered) as N, exercise, SUM(if(alternative = 'control',became_practiced, 0)) as control_N,
                    SUM(if(alternative = 'control', 0, became_practiced)) as worked_examples_N,
                    SUM(if(alternative = 'control', 0, became_practiced) - if(alternative = 'control',became_practiced, 0))/SUM(if(alternative = 'control',became_practiced, 1)) as diff
                    FROM
(SELECT e.alternative as alternative,
p.exercise as exercise,
sum(num_exercises)  * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) as N,
sum(became_practiced) * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) as became_practiced,
sum(became_mastered)  * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) as became_mastered
FROM
(SELECT user_id, exercise, count(1) as num_exercises,
MAX(if(incoming_level = 'unstarted', 1, 0)) as was_ever_unstarted,
MAX(if(incoming_level = 'unstarted' and outgoing_level != 'unstarted', 1, 0)) as became_practiced,
MAX(if(outgoing_level = 'mastery3', 1, 0)) as became_mastered,
from eli.worked_example_problem_logs
-- ignore review problems
WHERE incoming_level != 'mastery3' group EACH by user_id, exercise
HAVING was_ever_unstarted = 1) p
JOIN EACH
(select top_alternative as alternative, ud_user_id from eli.worked_example_users where not regexp_match( top_alternative, 'curat'
)) e
ON e.ud_user_id = p.user_id
GROUP BY exercise, alternative
order by exercise, alternative)
GROUP BY exercise
HAVING N > 10
order by diff desc;

## hint_reading_levels.py
import json
import string

import curses
from curses.ascii import isdigit
import nltk
from nltk.corpus import cmudict

d = cmudict.dict()
# Only works on known words
def nsyl(word):
    return [len(list(y for y in x if isdigit(y[-1]))) for x in d[word.lower()]]

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

reading_levels = {}


def reading_level(str):
    sentences = sent_detector.tokenize(str.strip())
    num_valid_sentences = 0
    num_valid_words = 0
    num_valid_syllables = 0
    for sentence in sentences:
        print sentence
        words_this_sentence = 0
        words=nltk.tokenize.word_tokenize(sentence)
        for word in words:
            if word.lower() in d:
                words_this_sentence += 1
        if words_this_sentence >= 5:
            num_valid_sentences += 1
            for word in words:
                if word.lower() in d:
                    num_valid_words += 1
                    num_valid_syllables += nsyl(word)[0]
    if not num_valid_sentences:
        return 0
    return 0.39 * (num_valid_words/num_valid_sentences) + 11.8 * (num_valid_syllables/ num_valid_words) - 15.59


all_exercises = FrozenExercise.get_all()
for exercise in all_exercises:
    if exercise.name in reading_levels:
        continue
    item_ids = []
    for pt in exercise.problem_types:
        for i in pt['items']:
             item_ids.append(i['id'])
    probs = [AssessmentItem.get_by_id(i) for i in item_ids]
    all_hints = ''
    for problem in probs:
        h = string.join([hi.get('content', '') for hi in json.loads(problem.item_data).get('hints', '')], ' ')
        all_hints += h
    rl = reading_level(all_hints)
    print exercise.name, rl
    reading_levels[exercise.name] = rl

## relevant_exercises_stats
SELECT *,
       N/became_practiced AS mean_exs_per_practiced,
       N/became_mastered AS mean_exs_per_mastery
FROM
  (SELECT e.alternative AS alternative,
          sum(num_exercises) * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) AS N,
          sum(became_practiced) * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) AS became_practiced,
          sum(became_mastered) * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) AS became_mastered
   FROM
     (SELECT user_id,
             exercise,
             count(1) AS num_exercises,
             MAX(if(incoming_level = 'unstarted', 1, 0)) AS was_ever_unstarted,
             MAX(if(incoming_level = 'unstarted'
                    AND outgoing_level != 'unstarted', 1, 0)) AS became_practiced,
             MAX(if(outgoing_level = 'mastery3', 1, 0)) AS became_mastered,
      FROM eli.worked_example_problem_logs -- ignore review problems

      WHERE incoming_level != 'mastery3'
      GROUP EACH BY user_id,
                    exercise HAVING was_ever_unstarted = 1) p
   JOIN EACH
     (SELECT top_alternative AS alternative,
             ud_user_id
      FROM eli.worked_example_users) e ON e.ud_user_id = p.user_id
   GROUP EACH BY alternative);

## useful_problem_logs.sql
SELECT p.user_id as user_id, p.exercise as exercise, p.correct as correct, p.task_type as task_type,
        p.incoming_level as incoming_level, p.outgoing_level as outgoing_level,
   case when p.time_taken < 1 then 1 when time_taken > 600 then 600 else time_taken end as time_taken
FROM
(SELECT * from [2014_05_31.ProblemLog] WHERE date_done_str > '2014-05-27')p
JOIN EACH
(select exercise from eli.worked_example_exercise) e
ON e.exercise = p.exercise;

## worked_example_users.sql
SELECT ud.*,
       mt.slug AS mission_slug,
       mt.title AS mission_title,
       mm.percent_mastered AS percent_mastered,
       bm.alternative as top_alternative,
       IFNULL(bm_p.alternative, '') as lower_alternative,
       CONCAT(bm.alternative, IFNULL(bm_p.alternative, '')) AS alternative
FROM
  (SELECT *
   FROM [latest_derived.user_segments]) ud
JOIN EACH
  (SELECT slug,
          title
   FROM eli.mission_title_slug) mt ON ud.active_mission = mt.title
JOIN EACH
  (SELECT bingo_id,
          alternative
   FROM [bingo.bingo_map]
   WHERE canonical_name='worked_examples') AS bm ON bm.bingo_id=ud.gae_bingo_identity
LEFT JOIN EACH
  (SELECT bingo_id,
          alternative
   FROM [bingo.bingo_map]
   WHERE canonical_name='worked_examples_popup') AS bm_p ON bm_p.bingo_id=ud.gae_bingo_identity
JOIN EACH
  (SELECT percent_mastered,
          user_email,
          mission
   FROM [latest_derived.mission_mastery]) mm ON ud.user_email = mm.user_email
AND mt.slug = mm.mission;
	SELECT top_alternative AS alternative,
	ROUND(NTH(500, QUANTILES(ud_points, 1001)), 2) AS median_points,
	ROUND(avg(ud_points),2) AS mean_points,
	ROUND(NTH(500, QUANTILES(ud_weekly_problem_days, 1001)),2) AS median_weekly_days,
	ROUND(avg(ud_weekly_problem_days),2) AS mean_weekly_days,
	ROUND(NTH(500, QUANTILES(ud_monthly_problem_seconds/60, 1001)),2) AS median_monthly_minutes,
	ROUND(avg(ud_monthly_problem_seconds/60),2) AS mean_monthly_minutes,
	ROUND(NTH(500, QUANTILES(ud_monthly_problems_done, 1001)),2) AS median_monthly_problems,
	ROUND(avg(ud_monthly_problems_done),2) AS mean_monthly_problems,
	ROUND(NTH(500, QUANTILES(percent_mastered, 1001)*100),2) AS median_percent_mastered,
	ROUND(avg(percent_mastered) * 100,2) AS mean_percent_mastered,
	COUNT(1) AS N
	FROM [eli.worked_example_users]
	GROUP BY alternative;
	SELECT * FROM eli.worked_example_problem_logs where exercise in(
	'multi-step-word-problems-with-whole-numbers',
	'comparing-with-multiplication',
	'percentage_word_problems_1',
	'recog-features-func-2',
	'constructing-proportions-to-solve-application-problems',
	'interpret-features-func-2'
	)
	SELECT SUM(became_mastered) as N, exercise, SUM(if(alternative = 'control',became_practiced, 0)) as control_N,
	SUM(if(alternative = 'control', 0, became_practiced)) as worked_examples_N,
	SUM(if(alternative = 'control', 0, became_practiced) - if(alternative = 'control',became_practiced, 0))/SUM(if(alternative = 'control',became_practiced, 1)) as diff
	FROM
	(SELECT e.alternative as alternative,
	p.exercise as exercise,
	sum(num_exercises) * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) as N,
	sum(became_practiced) * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) as became_practiced,
	sum(became_mastered) * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) as became_mastered
	FROM
	(SELECT user_id, exercise, count(1) as num_exercises,
	MAX(if(incoming_level = 'unstarted', 1, 0)) as was_ever_unstarted,
	MAX(if(incoming_level = 'unstarted' and outgoing_level != 'unstarted', 1, 0)) as became_practiced,
	MAX(if(outgoing_level = 'mastery3', 1, 0)) as became_mastered,
	from eli.worked_example_problem_logs
	-- ignore review problems
	WHERE incoming_level != 'mastery3' group EACH by user_id, exercise
	HAVING was_ever_unstarted = 1) p
	JOIN EACH
	(select top_alternative as alternative, ud_user_id from eli.worked_example_users where not regexp_match( top_alternative, 'curat'
	)) e
	ON e.ud_user_id = p.user_id
	GROUP BY exercise, alternative
	order by exercise, alternative)
	GROUP BY exercise
	HAVING N > 10
	order by diff desc;
	import json
	import string

	import curses
	from curses.ascii import isdigit
	import nltk
	from nltk.corpus import cmudict

	d = cmudict.dict()
	# Only works on known words
	def nsyl(word):
	return [len(list(y for y in x if isdigit(y[-1]))) for x in d[word.lower()]]

	sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

	reading_levels = {}


	def reading_level(str):
	sentences = sent_detector.tokenize(str.strip())
	num_valid_sentences = 0
	num_valid_words = 0
	num_valid_syllables = 0
	for sentence in sentences:
	print sentence
	words_this_sentence = 0
	words=nltk.tokenize.word_tokenize(sentence)
	for word in words:
	if word.lower() in d:
	words_this_sentence += 1
	if words_this_sentence >= 5:
	num_valid_sentences += 1
	for word in words:
	if word.lower() in d:
	num_valid_words += 1
	num_valid_syllables += nsyl(word)[0]
	if not num_valid_sentences:
	return 0
	return 0.39 * (num_valid_words/num_valid_sentences) + 11.8 * (num_valid_syllables/ num_valid_words) - 15.59




	all_exercises = FrozenExercise.get_all()
	for exercise in all_exercises:
	if exercise.name in reading_levels:
	continue
	item_ids = []
	for pt in exercise.problem_types:
	for i in pt['items']:
	item_ids.append(i['id'])
	probs = [AssessmentItem.get_by_id(i) for i in item_ids]
	all_hints = ''
	for problem in probs:
	h = string.join([hi.get('content', '') for hi in json.loads(problem.item_data).get('hints', '')], ' ')
	all_hints += h
	rl = reading_level(all_hints)
	print exercise.name, rl
	reading_levels[exercise.name] = rl
	SELECT *,
	N/became_practiced AS mean_exs_per_practiced,
	N/became_mastered AS mean_exs_per_mastery
	FROM
	(SELECT e.alternative AS alternative,
	sum(num_exercises) * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) AS N,
	sum(became_practiced) * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) AS became_practiced,
	sum(became_mastered) * if(regexp_match(alternative, 'show'), 2, 1) * if(regexp_match(alternative, 'curated'), 3, 1) AS became_mastered
	FROM
	(SELECT user_id,
	exercise,
	count(1) AS num_exercises,
	MAX(if(incoming_level = 'unstarted', 1, 0)) AS was_ever_unstarted,
	MAX(if(incoming_level = 'unstarted'
	AND outgoing_level != 'unstarted', 1, 0)) AS became_practiced,
	MAX(if(outgoing_level = 'mastery3', 1, 0)) AS became_mastered,
	FROM eli.worked_example_problem_logs -- ignore review problems

	WHERE incoming_level != 'mastery3'
	GROUP EACH BY user_id,
	exercise HAVING was_ever_unstarted = 1) p
	JOIN EACH
	(SELECT top_alternative AS alternative,
	ud_user_id
	FROM eli.worked_example_users) e ON e.ud_user_id = p.user_id
	GROUP EACH BY alternative);
	SELECT p.user_id as user_id, p.exercise as exercise, p.correct as correct, p.task_type as task_type,
	p.incoming_level as incoming_level, p.outgoing_level as outgoing_level,
	case when p.time_taken < 1 then 1 when time_taken > 600 then 600 else time_taken end as time_taken
	FROM
	(SELECT * from [2014_05_31.ProblemLog] WHERE date_done_str > '2014-05-27')p
	JOIN EACH
	(select exercise from eli.worked_example_exercise) e
	ON e.exercise = p.exercise;
	SELECT ud.*,
	mt.slug AS mission_slug,
	mt.title AS mission_title,
	mm.percent_mastered AS percent_mastered,
	bm.alternative as top_alternative,
	IFNULL(bm_p.alternative, '') as lower_alternative,
	CONCAT(bm.alternative, IFNULL(bm_p.alternative, '')) AS alternative
	FROM
	(SELECT *
	FROM [latest_derived.user_segments]) ud
	JOIN EACH
	(SELECT slug,
	title
	FROM eli.mission_title_slug) mt ON ud.active_mission = mt.title
	JOIN EACH
	(SELECT bingo_id,
	alternative
	FROM [bingo.bingo_map]
	WHERE canonical_name='worked_examples') AS bm ON bm.bingo_id=ud.gae_bingo_identity
	LEFT JOIN EACH
	(SELECT bingo_id,
	alternative
	FROM [bingo.bingo_map]
	WHERE canonical_name='worked_examples_popup') AS bm_p ON bm_p.bingo_id=ud.gae_bingo_identity
	JOIN EACH
	(SELECT percent_mastered,
	user_email,
	mission
	FROM [latest_derived.mission_mastery]) mm ON ud.user_email = mm.user_email
	AND mt.slug = mm.mission;