|
Package json_to_relation ::
Module edxTrackLogJSONParser
|
|
1 '''
2 Created on Oct 2, 2013
3
4 @author: paepcke
5 '''
6 import json
7
8 from col_data_type import ColDataType
9 from generic_json_parser import GenericJSONParser
10 import logging
11
12
14 '''
15 Parser specialized for EdX track logs.
16 '''
17
def __init__(self, jsonToRelationConverter):
    '''
    Initialize the parser by forwarding the converter to the
    superclass initializer; no additional state is set up here.

    @param jsonToRelationConverter: converter instance handed through
        unchanged to the superclass constructor
    @type jsonToRelationConverter: JSONToRelation
    '''
    parentView = super(EdXTrackLogJSONParser, self)
    parentView.__init__(jsonToRelationConverter)
25
27 '''
28 Given one line from the EdX Track log, produce one row
29 of relational output. Return is an array of values, the
30 same that is passed in. On the way, the parent JSONToRelation
31 object is called to ensure that JSON fields for which new columns
32 have not been created yet receive a place in the row array.
33 Different types of JSON records will be passed: server heartbeats,
34 dashboard accesses, account creations, user logins. Example record
35 for the latter::
36 {"username": "",
37 "host": "class.stanford.edu",
38 "event_source": "server",
39 "event_type": "/accounts/login",
40 "time": "2013-06-14T00:31:57.661338",
41 "ip": "98.230.189.66",
42 "event": "{
43 \"POST\": {},
44 \"GET\": {
45 \"next\": [\"/courses/Medicine/HRP258/Statistics_in_Medicine/courseware/80160e.../\"]}}",
46 "agent": "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101
47 Firefox/21.0",
48 "page": null}
49
50 @param jsonStr: string of a single, self contained JSON object
51 @type jsonStr: String
52 @param row: partially filled array of values. Passed by reference
53 @type row: List<<any>>
54 @return: the filled-in row
55 @rtype: [<any>]
56 '''
57 try:
58 record = json.loads(jsonStr)
59 for attribute, value in record.iteritems():
60
61
62
63 if (attribute == 'event' and value and not isinstance(value, dict)):
64
65 nestedValue = json.loads(value)
66 record['fullCourseRef'] = nestedValue['GET']['next'][0]
67
68 (fullCourseName, course_id) = get_course_id(record)
69 if course_id is not None:
70 record['course_id'] = course_id
71 if fullCourseName is not None:
72 record['fullCourseName'] = fullCourseName
73 except Exception as e:
74
75 logging.error("While importing EdX track log event: " + `e`)
76
77 if record is not None:
78 for jsonFldName in record.keys():
79 fldValue = record[jsonFldName]
80
81 try:
82 colDataType = self.jsonToRelationConverter.schemaHints[jsonFldName]
83 except KeyError:
84 colDataType = ColDataType.sqlTypeFromValue(fldValue)
85
86 self.jsonToRelationConverter.ensureColExistence(jsonFldName, colDataType)
87 self.setValInRow(row, jsonFldName, fldValue)
88
89 return row
90
def get_course_id(event):
    '''
    Given a 'pythonized' JSON tracking event object, find the
    course URL, and extract the course ID from it.

    Where the URL is taken from depends on the event:
      - events whose 'event_source' is 'server': the 'event_type'
        value, except for login events (see below);
      - all other events: the 'page' value.

    Login events ('event_type' of '/accounts/login', with or without
    a trailing slash) carry the URL inside their 'event' payload::

        {"username": "",
         "host": "class.stanford.edu",
         "event_source": "server",
         "event_type": "/accounts/login",
         "time": "2013-06-14T00:31:57.661338",
         "ip": "98.230.189.66",
         "event": "{
             \"POST\": {},
             \"GET\": {
                 \"next\": [\"/courses/Medicine/HRP258/Statistics_in_Medicine/courseware/80160e.../\"]}}",
         "agent": "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
         "page": null}

    Notice the 'event' key's value being a *string* containing JSON,
    rather than a nested JSON object. Such a string is decoded here
    before the first entry of its GET/'next' array is read; an already
    decoded dict payload is accepted as well. If no 'next' URL can be
    obtained, the login event's 'event_type' is used like for any
    other server event.

    The course ID consists of the (up to) three path components that
    follow the 'courses' component of the URL, joined by '/'; for the
    example above: 'Medicine/HRP258/Statistics_in_Medicine'.

    Keys missing from the event are treated like null values instead
    of raising KeyError.

    @param event: JSON record of an edx tracking event as internalized dict
    @type event: Dict<String,<any>>
    @return: two-tuple: the URL the course ID was searched in, and the
        course ID. Either element is None if it could not be obtained.
    @rtype: (String | None, String | None)
    '''
    course_id = None
    if event.get('event_source') == 'server':
        fullCourseName = event.get('event_type')
        # The documented login record has no trailing slash, while the
        # previous check required one; accept both spellings.
        if fullCourseName in ('/accounts/login', '/accounts/login/'):
            payload = event.get('event')
            if payload and not isinstance(payload, dict):
                # Payload arrived as a JSON *string*; decode it first.
                try:
                    payload = json.loads(payload)
                except (TypeError, ValueError):
                    payload = None
            try:
                fullCourseName = payload['GET']['next'][0]
            except (KeyError, IndexError, TypeError):
                # No usable 'next' URL: keep the event_type fallback.
                pass
    else:
        fullCourseName = event.get('page')

    if fullCourseName:
        courseNameFrags = fullCourseName.split('/')
        if 'courses' in courseNameFrags:
            start = courseNameFrags.index('courses') + 1
            course_id = "/".join(map(str, courseNameFrags[start:start + 3]))
    return (fullCourseName, course_id)
140