Skip to content

Instantly share code, notes, and snippets.

@rjurney
Created August 1, 2019 02:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rjurney/ca838833dba18f69a283321d16213b2b to your computer and use it in GitHub Desktop.
Save rjurney/ca838833dba18f69a283321d16213b2b to your computer and use it in GitHub Desktop.
|_Body|label_0|label_1|label_10|label_100|label_101|label_102|label_103|label_104|label_11|label_12|label_13|label_14|label_15|label_16|label_17|label_18|label_19|label_2|label_20|label_21|label_22|label_23|label_24|label_25|label_26|label_27|label_28|label_29|label_3|label_30|label_31|label_32|label_33|label_34|label_35|label_36|label_37|label_38|label_39|label_4|label_40|label_41|label_42|label_43|label_44|label_45|label_46|label_47|label_48|label_49|label_5|label_50|label_51|label_52|label_53|label_54|label_55|label_56|label_57|label_58|label_59|label_6|label_60|label_61|label_62|label_63|label_64|label_65|label_66|label_67|label_68|label_69|label_7|label_70|label_71|label_72|label_73|label_74|label_75|label_76|label_77|label_78|label_79|label_8|label_80|label_81|label_82|label_83|label_84|label_85|label_86|label_87|label_88|label_89|label_9|label_90|label_91|label_92|label_93|label_94|label_95|label_96|label_97|label_98|label_99|
# Evaluate how skewed the sample is: for every label column, count how many
# rows carry the negative (0) and the positive (1) value.
# `spark` and `limit` are expected to be defined earlier in the session.
stratified_sample = spark.read.json(
    's3://stackoverflow-events/07-30-2019/Questions.Stratified.{}.*.jsonl'.format(limit)
)
# registerTempTable() is deprecated since Spark 2.0; this is its replacement.
stratified_sample.createOrReplaceTempView('stratified_sample')

# Maps label index -> [negative_count, positive_count].
label_counts = {}
# NOTE(review): only label_0..label_99 are inspected; if the frame also holds
# label_100..label_104 they are not covered here -- confirm this is intended.
for i in range(0, 100):
    count_df = spark.sql(
        'SELECT label_{}, COUNT(*) as total FROM stratified_sample GROUP BY label_{}'.format(i, i)
    )
    # GROUP BY gives no ordering guarantee, so the counts are keyed by the
    # label value itself instead of relying on the position of each row.
    # Assumes labels are 0/1 encoded (int, float or bool) -- TODO confirm.
    totals = {
        int(row[0]): row['total']
        for row in count_df.collect()
        if row[0] is not None
    }
    # A label with a single class present yields one row only; report the
    # missing class as 0 rather than failing with an IndexError.
    neg_count = totals.get(0, 0)
    pos_count = totals.get(1, 0)
    label_counts[i] = [neg_count, pos_count]
{0: [1034673, 14491],
1: [1023250, 25914],
2: [1030462, 18702],
3: [1035645, 13519],
4: [1037445, 11719],
5: [1010664, 38500],
6: [1031699, 17465],
7: [1031501, 17663],
8: [1033207, 15957],
9: [1035151, 14013],
10: [1018630, 30534],
11: [1025665, 23499],
12: [1032134, 17030],
13: [1037041, 12123],
14: [1030140, 19024],
15: [1029116, 20048],
16: [964140, 85024],
17: [1015889, 33275],
18: [1036636, 12528],
19: [1036906, 12258],
20: [1011727, 37437],
21: [1035123, 14041],
22: [1034847, 14317],
23: [1029004, 20160],
24: [1036227, 12937],
25: [1033307, 15857],
26: [1034969, 14195],
27: [1034847, 14317],
28: [1017546, 31618],
29: [1027919, 21245],
30: [1037028, 12136],
31: [1035912, 13252],
32: [1037308, 11856],
33: [1034225, 14939],
34: [1034921, 14243],
35: [1036984, 12180],
36: [1037617, 11547],
37: [1036210, 12954],
38: [999782, 49382],
39: [1033100, 16064],
40: [1035193, 13971],
41: [1003468, 45696],
42: [1027372, 21792],
43: [962634, 86530],
44: [957512, 91652],
45: [1001581, 47583],
46: [1025176, 23988],
47: [1036328, 12836],
48: [1034679, 14485],
49: [1028618, 20546],
50: [1033773, 15391],
51: [1035232, 13932],
52: [1035099, 14065],
53: [1037324, 11840],
54: [1036799, 12365],
55: [1035315, 13849],
56: [1034812, 14352],
57: [1016307, 32857],
58: [1028978, 20186],
59: [1036016, 13148],
60: [1022478, 26686],
61: [1035476, 13688],
62: [1034085, 15079],
63: [1034439, 14725],
64: [1036648, 12516],
65: [966255, 82909],
66: [1035025, 14139],
67: [1036879, 12285],
68: [966029, 83135],
69: [1034591, 14573],
70: [1031952, 17212],
71: [1037355, 11809],
72: [1034035, 15129],
73: [1036333, 12831],
74: [1029539, 19625],
75: [1035266, 13898],
76: [1027668, 21496],
77: [1021161, 28003],
78: [1036151, 13013],
79: [1037613, 11551],
80: [1037727, 11437],
81: [1033519, 15645],
82: [1035104, 14060],
83: [1032672, 16492],
84: [1005891, 43273],
85: [1023358, 25806],
86: [1036416, 12748],
87: [1031117, 18047],
88: [1028045, 21119],
89: [1037749, 11415],
90: [1037511, 11653],
91: [1036230, 12934],
92: [1035136, 14028],
93: [1035080, 14084],
94: [1035647, 13517],
95: [1036683, 12481],
96: [1033782, 15382],
97: [1021625, 27539],
98: [1035833, 13331],
99: [1033752, 15412]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment