Package json_to_relation :: Module edxTrackLogJSONParser
[hide private]
[frames] | no frames]

Source Code for Module json_to_relation.edxTrackLogJSONParser

  1  ''' 
  2  Created on Oct 2, 2013 
  3   
  4  @author: paepcke 
  5  ''' 
  6  import json 
  7   
  8  from col_data_type import ColDataType 
  9  from generic_json_parser import GenericJSONParser 
 10  import logging 
 11   
 12   
class EdXTrackLogJSONParser(GenericJSONParser):
    '''
    Parser specialized for EdX track logs.
    '''

    def __init__(self, jsonToRelationConverter):
        '''
        Constructor

        @param jsonToRelationConverter: JSONToRelation instance
        @type jsonToRelationConverter: JSONToRelation
        '''
        super(EdXTrackLogJSONParser, self).__init__(jsonToRelationConverter)

    def processOneJSONObject(self, jsonStr, row):
        '''
        Given one line from the EdX Track log, produce one row
        of relational output. Return is an array of values, the
        same that is passed in. On the way, the parent JSONToRelation
        object is called to ensure that JSON fields for which new columns
        have not been created yet receive a place in the row array.
        Different types of JSON records will be passed: server heartbeats,
        dashboard accesses, account creations, user logins. Example record
        for the latter::

            {"username": "",
             "host": "class.stanford.edu",
             "event_source": "server",
             "event_type": "/accounts/login",
             "time": "2013-06-14T00:31:57.661338",
             "ip": "98.230.189.66",
             "event": "{
                        \"POST\": {},
                        \"GET\": {
                             \"next\": [\"/courses/Medicine/HRP258/Statistics_in_Medicine/courseware/80160e.../\"]}}",
             "agent": "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101
             Firefox/21.0",
             "page": null}

        Besides the fields found in the log line, up to three derived
        fields are added when they can be computed: 'fullCourseRef',
        'fullCourseName', and 'course_id'.

        A line that is not valid JSON, or whose top level is not a JSON
        object, is logged as an error and leaves the row untouched.

        @param jsonStr: string of a single, self contained JSON object
        @type jsonStr: String
        @param row: partially filled array of values. Passed by reference
        @type row: List<<any>>
        @return: the filled-in row
        @rtype: [<any>]
        '''
        try:
            record = json.loads(jsonStr)
        except (ValueError, TypeError) as e:
            # Malformed line: report it and hand the row back unchanged
            # rather than failing on an unbound 'record' further down.
            logging.error("While importing EdX track log event: " + repr(e))
            return row

        if not isinstance(record, dict):
            # Valid JSON, but not an object (e.g. a list or null): there
            # are no fields to turn into columns.
            logging.error("While importing EdX track log event: " +
                          "expected a JSON object, got " + repr(record))
            return row

        # Find cases in which the 'event' value is a *string* that
        # contains a JSON expression, as opposed to a JSON sub-object,
        # which manifests as a Python dict. The key is looked up directly
        # instead of looping over the record, because adding keys to a
        # dict while iterating over it raises a RuntimeError.
        eventValue = record.get('event')
        if eventValue and not isinstance(eventValue, dict):
            fullCourseRef = _login_next_url(eventValue)
            if fullCourseRef is not None:
                record['fullCourseRef'] = fullCourseRef

        # Dig the course ID out of the record. A failure here must not
        # prevent the fields that *are* present from being exported.
        try:
            (fullCourseName, course_id) = get_course_id(record)
        except (KeyError, IndexError, TypeError, AttributeError) as e:
            logging.error("While importing EdX track log event: " + repr(e))
        else:
            if course_id is not None:
                record['course_id'] = course_id
            if fullCourseName is not None:
                record['fullCourseName'] = fullCourseName

        for jsonFldName, fldValue in record.items():
            # Check whether caller gave a type hint for this column:
            try:
                colDataType = self.jsonToRelationConverter.schemaHints[jsonFldName]
            except KeyError:
                colDataType = ColDataType.sqlTypeFromValue(fldValue)

            self.jsonToRelationConverter.ensureColExistence(jsonFldName, colDataType)
            self.setValInRow(row, jsonFldName, fldValue)

        return row


# Server-side event types that denote a user login. The trailing-slash
# variant is accepted as well as the form seen in the sample log record.
_LOGIN_EVENT_TYPES = ('/accounts/login', '/accounts/login/')


def _login_next_url(payload):
    '''
    Return the first URL under GET/next of an 'event' payload, or None.

    @param payload: value of a tracking record's 'event' field; either
        a dict, or a string that contains a JSON object.
    @type payload: {Dict | String}
    @return: the first entry of payload['GET']['next'], or None if the
        payload cannot be decoded or lacks that entry.
    @rtype: {String | None}
    '''
    if payload and not isinstance(payload, dict):
        try:
            payload = json.loads(payload)
        except (ValueError, TypeError):
            # Not JSON at all, e.g. a plain-text event description.
            return None
    try:
        return payload['GET']['next'][0]
    except (KeyError, IndexError, TypeError):
        return None


def get_course_id(event):
    '''
    Given a 'pythonized' JSON tracking event object, find
    the course URL, and extract the course name from it.
    A number of different events occur, which do not contain
    course IDs: server heartbeats, account creation, dashboard
    accesses. Among them are logins, which look like this::

        {"username": "",
         "host": "class.stanford.edu",
         "event_source": "server",
         "event_type": "/accounts/login",
         "time": "2013-06-14T00:31:57.661338",
         "ip": "98.230.189.66",
         "event": "{
                    \"POST\": {},
                    \"GET\": {
                         \"next\": [\"/courses/Medicine/HRP258/Statistics_in_Medicine/courseware/80160e.../\"]}}",
         "agent": "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101
         Firefox/21.0",
         "page": null}

    Notice the 'event' key's value being a *string* containing JSON, rather than
    a nested JSON object. This requires special attention. Buried inside
    that string is the 'next' tag, whose value is an array with a long (here
    partially elided) hex number. This is where the course number is
    extracted.

    For server events other than logins the 'event_type' value is used
    as the course URL; for all non-server events the 'page' value is
    used. The course ID consists of the (up to) three path components
    that follow 'courses' in that URL.

    @param event: JSON record of an edx tracking event as internalized dict
    @type event: Dict<String,Dict<<any>>
    @return: two-tuple: full name of course in which event occurred, and
        course ID. Either element is None if it could not be obtained.
    @rtype: (String | None, String | None)
    @raise KeyError: if 'event_source' is missing, or if the field that
        holds the course URL for that source ('event_type' or 'page')
        is missing.
    '''
    course_id = None
    if event['event_source'] == 'server':
        # get course_id from event type
        fullCourseName = event['event_type']
        if fullCourseName in _LOGIN_EVENT_TYPES:
            # Logins keep the course URL inside the 'event' payload,
            # which may be a dict or a JSON string. If it cannot be
            # found there, the event type itself is kept.
            nextUrl = _login_next_url(event.get('event'))
            if nextUrl is not None:
                fullCourseName = nextUrl
    else:
        fullCourseName = event['page']
    if fullCourseName:
        courseNameFrags = fullCourseName.split('/')
        if 'courses' in courseNameFrags:
            i = courseNameFrags.index('courses')
            # The fragments are already strings; joining them directly
            # avoids an str() conversion that can fail on non-ASCII
            # names under Python 2. An empty result (URL ends right
            # after 'courses') counts as "no course ID".
            course_id = "/".join(courseNameFrags[i+1:i+4]) or None
    return (fullCourseName, course_id)
140