Package json_to_relation :: Module generic_json_parser
[hide private]
[frames | no frames]

Source Code for Module json_to_relation.generic_json_parser

  1  ''' 
  2  Created on Sep 23, 2013 
  3   
  4  @author: paepcke 
  5  ''' 
  6  import StringIO 
  7  import ijson 
  8  import re 
  9   
 10  from col_data_type import ColDataType 
 11   
 12   
class GenericJSONParser(object):
    '''
    Takes a JSON string, and returns a CSV row for later import
    into a relational database.

    Column bookkeeping is delegated to the JSONToRelation instance
    passed to the constructor. This class uses that instance's
    ensureLegalIdentifierChars(), ensureColExistence(), schemaHints
    and cols members.
    '''

    # Regex pattern to remove 'item.' from column header names
    # (see removeItemPartOfString()). Example: employee.item.name
    # will be replaced by employee.name. When used in search(),
    # the regex below creates a Match result instance with two
    # groups: 'item' and 'name'. The leading (?:^|\.) requires
    # 'item' to be a complete dot-delimited component, so that
    # labels such as 'lineitem.price' are left alone.
    REMOVE_ITEM_FROM_STRING_PATTERN = re.compile(r'(?:^|\.)(item)\.([^.]*$)')

    def __init__(self, jsonToRelationConverter):
        '''
        Constructor.

        @param jsonToRelationConverter: JSONToRelation instance
        @type jsonToRelationConverter: JSONToRelation
        '''
        self.jsonToRelationConverter = jsonToRelationConverter

    def processOneJSONObject(self, jsonStr, row):
        '''
        Given a JSON string that is one entire JSON object, parse the
        string into nested dicts. Derive relational column names from the
        (possibly nested) labels. Cooperate with the parent JSONToRelation
        instance to build a schema of typed SQL columns. Fill the passed-in
        row with values from the JSON string. The following ijson
        (event, value) pairs are handled::
            ('null', None)
            ('boolean', <True or False>)
            ('number', <int or Decimal>)
            ('string', <unicode>)
            ('map_key', <str>)
            ('start_map', None)
            ('end_map', None)
            ('start_array', None)
            ('end_array', None)

        Unless the converter's schemaHints say otherwise, strings and
        nulls go into TEXT columns (null is stored as the empty string),
        booleans into SMALLINT columns as 1/0, and numbers into DOUBLE
        columns.

        @param jsonStr: string of a single, self contained JSON object
        @type jsonStr: String
        @param row: partially filled array of values.
        @type row: List<<any>>
        @return: the passed-in row, modified in place
        @rtype: List<<any>>
        @raise ValueError: if the JSON parser emits an event not listed above
        '''
        parser = ijson.parse(StringIO.StringIO(jsonStr))
        converter = self.jsonToRelationConverter
        # Stack of array index counters for use with
        # nested arrays:
        arrayIndexStack = Stack()
        for nestedLabel, event, value in parser:
            if event == "start_map":
                if not arrayIndexStack.empty():
                    # Starting a new attribute/value pair within an array: need
                    # a new number to differentiate column headers
                    self.incArrayIndex(arrayIndexStack)
                continue

            # Events with an empty label, and pure structure
            # events, contribute no column:
            if len(nestedLabel) == 0 or event in ("map_key", "end_map"):
                continue

            if not arrayIndexStack.empty():
                # Label is now something like
                # employees.item.firstName. The 'item' is ijson's way of indicating
                # that we are in an array. Remove the 'item.' part; it makes
                # the relation column header unnecessarily long. Then append
                # our array index number with an underscore.
                # NOTE(review): the index only advances on start_map, so an
                # array of plain scalars presumably keeps index -1 for every
                # element -- confirm whether that is intended.
                nestedLabel = self.removeItemPartOfString(nestedLabel) +\
                              '_' +\
                              str(arrayIndexStack.top(exceptionOnEmpty=True))

            # Ensure that label contains only MySQL-legal identifier chars.
            # Else quote the label:
            nestedLabel = converter.ensureLegalIdentifierChars(nestedLabel)

            # Check whether caller gave a type hint for this column:
            try:
                colDataType = converter.schemaHints[nestedLabel]
            except KeyError:
                colDataType = None

            if event == "string":
                self._storeValue(row, nestedLabel, colDataType, ColDataType.TEXT, value)
                continue

            if event == "boolean":
                self._storeValue(row, nestedLabel, colDataType, ColDataType.SMALLINT, 1 if value else 0)
                continue

            if event == "number":
                self._storeValue(row, nestedLabel, colDataType, ColDataType.DOUBLE, value)
                continue

            if event == "null":
                self._storeValue(row, nestedLabel, colDataType, ColDataType.TEXT, '')
                continue

            if event == "start_array":
                # New array index entry for this nested label.
                # Used to generate <label>_0, <label>_1, etc. for
                # column names:
                arrayIndexStack.push(-1)
                continue

            if event == "end_array":
                # Array closed; forget the array counter:
                arrayIndexStack.pop()
                continue

            raise ValueError("Unknown JSON value type at %s for value %s (ijson event: %s)" % (nestedLabel,value,str(event)))
        return row

    def _storeValue(self, row, colName, hintedType, defaultType, value):
        '''
        Make sure column colName exists in the converter's schema, then
        place value into that column's position in row.

        @param row: partially filled array of values.
        @type row: List<<any>>
        @param colName: name of the destination column
        @type colName: String
        @param hintedType: column type from the schema hints, or None
        @type hintedType: ColDataType
        @param defaultType: column type to use when hintedType is None
        @type defaultType: ColDataType
        @param value: the field value
        @type value: <any>
        '''
        colDataType = defaultType if hintedType is None else hintedType
        self.jsonToRelationConverter.ensureColExistence(colName, colDataType)
        self.setValInRow(row, colName, value)

    def setValInRow(self, theRow, colName, value):
        '''
        Given a column name, a value and a partially filled row,
        add the column to the row, or set the value in an already
        existing row. Uses the JSONToRelation instance passed to
        __init__() to obtain current schema. If the column's position
        lies beyond the current end of the row, the gap is filled with
        the string 'null'; this includes the case of an empty row whose
        first value belongs to a column other than the first.

        @param theRow: list of values in their proper column positions
        @type theRow: List<<any>>
        @param colName: name of column into which value is to be inserted.
        @type colName: String
        @param value: the field value
        @type value: <any>, as per ColDataType
        @return: theRow, modified in place
        @rtype: List<<any>>
        '''
        # Assumes caller has called ensureColExistence() on the
        # JSONToRelation object; so the following won't have
        # a key failure:
        targetPos = self.jsonToRelationConverter.cols[colName].colPos

        # Is value to go into an already existing column?
        if targetPos < len(theRow):
            theRow[targetPos] = value
            return theRow

        # Value goes at or beyond the current end of the row.
        # Span any missing columns with nulls (none are needed
        # when the value goes just beyond the end), then add
        # the value itself:
        theRow.extend(['null'] * (targetPos - len(theRow)))
        theRow.append(value)
        return theRow

    def incArrayIndex(self, arrayIndexStack):
        '''
        Add one to the counter on top of the given stack.

        @param arrayIndexStack: stack of array index counters
        @type arrayIndexStack: Stack
        '''
        arrayIndexStack.push(arrayIndexStack.pop() + 1)

    def decArrayIndex(self, arrayIndexStack):
        '''
        Subtract one from the counter on top of the given stack.

        @param arrayIndexStack: stack of array index counters
        @type arrayIndexStack: Stack
        '''
        arrayIndexStack.push(arrayIndexStack.pop() - 1)

    def removeItemPartOfString(self, label):
        '''
        Given a label, like employee.item.name, remove the 'item'
        component that immediately precedes the last component of
        the label. Labels without such a component are returned
        unchanged.

        @param label: string from which last 'item' occurrence is to be removed
        @type label: String
        @return: label without the trailing 'item.' component
        @rtype: String
        '''
        match = GenericJSONParser.REMOVE_ITEM_FROM_STRING_PATTERN.search(label)
        if match is None:
            # no appropriate occurrence of 'item' found
            return label
        # Get label portion up to last occurrence of 'item',
        # and add the last part of the label to that part:
        return label[:match.start(1)] + match.group(2)
214
class Stack(object):
    '''
    Minimal last-in/first-out stack backed by a Python list.
    '''

    def __init__(self):
        '''
        Start out with an empty stack.
        '''
        self.stackArray = []

    def empty(self):
        '''
        @return: True if the stack holds no items, else False
        @rtype: bool
        '''
        return not self.stackArray

    def push(self, item):
        '''
        Place item on top of the stack.

        @param item: value to add
        @type item: <any>
        '''
        self.stackArray.append(item)

    def pop(self):
        '''
        Remove the topmost item and return it.

        @return: the item that was on top of the stack
        @rtype: <any>
        @raise ValueError: if the stack is empty
        '''
        try:
            topItem = self.stackArray.pop()
        except IndexError:
            raise ValueError("Stack empty.")
        return topItem

    def top(self, exceptionOnEmpty=False):
        '''
        Return the topmost item without removing it.

        @param exceptionOnEmpty: if True, an empty stack raises
            ValueError; otherwise None is returned for an empty stack.
        @type exceptionOnEmpty: bool
        @return: the item on top of the stack, or None
        @rtype: <any>
        @raise ValueError: if the stack is empty and exceptionOnEmpty is True
        '''
        if self.stackArray:
            return self.stackArray[-1]
        if exceptionOnEmpty:
            raise ValueError("Call to Stack instance method 'top' when stack is empty.")
        return None

    def stackHeight(self):
        '''
        @return: number of items currently on the stack
        @rtype: int
        '''
        return len(self.stackArray)
242