boxnote.py (12281B) - raw
"""
The primary module for processing and parsing Box Notes
"""
import json
import os
import re
import typing
from functools import reduce
from xml.etree import ElementTree as ET

from . import html, markdown
from .table import Table

dir_path = os.path.dirname(os.path.realpath(__file__))

# Translation table that backslash-escapes the characters which have a special
# meaning in Markdown. Doing it in a single pass means the backslashes that are
# inserted are never themselves re-escaped.
_MARKDOWN_ESCAPES = str.maketrans({char: "\\" + char for char in "\\*[]`"})


class AttributeChunk:  # not really a class
    """
    An attribute chunk is formatted like this:
    *n[*n...]+n[|n+n]
    eg
    *4*1+1|1+1
    where *n refers to an attribute to apply from the attribute pool
    and +n is a number of characters to apply that attribute to
    and |n is indicative of a line break (unclear the purpose of this)

    Every n is a base36 number.
    """

    def __init__(self, attribute_string_chunk, position=None):
        """
        attribute_string_chunk: a single chunk of the attribute string
        position: unused; kept so existing callers keep working
        """
        self.attribute_string_chunk = attribute_string_chunk
        self.attributes = set(self._all_items_after_indicator("*"))
        self.num_characters = sum(self._all_items_after_indicator("+"))
        self.num_linebreaks = sum(self._all_items_after_indicator("|"))

    def _all_items_after_indicator(self, indicator) -> typing.List[int]:
        """
        Regex to get all the numbers after the given indicator
        Then convert this base36 number into an integer
        * -> attribute
        + -> number of characters to apply attribute to
        | -> number of linebreaks (I think)

        Returns a list of integers, one per occurrence of the indicator.
        Raises ValueError if an indicator is not followed by a base36 number.
        """
        pattern = r"{}([^\+\|\*]*)".format(re.escape(indicator))
        items = re.findall(pattern, self.attribute_string_chunk)
        return [int(item, 36) for item in items]


class FormattedText:
    """
    A block of text with parsed information about it
    """

    def __init__(self, attributes, text, num_linebreaks, tagnums=None):
        """
        attributes: the attribute pool entries that apply to this text
        text: the raw text of this block
        num_linebreaks: number of line breaks to emit after the text
        tagnums: the attribute numbers the attributes were looked up from
        """
        self.attributes = attributes
        # for debugging; tolerate the documented default of None
        self.tagnums = list(tagnums) if tagnums is not None else []
        self.styles = self.get_base_styles()
        self.num_linebreaks = num_linebreaks
        self.text = text
        # Must come after styles/num_linebreaks/text are set: it reads all three
        # and may blank out self.text.
        self.element_tree = self.styles_to_elements()
        self.table_id, self.row_id, self.column_id = self.get_table_info()
        self.list_type, self.list_level = self.get_list_info()

    def get_base_styles(self):
        """
        Convert every attribute into an html tag.

        Always returns a non-empty list: text without any attribute is wrapped
        in a plain span so that the callers can rely on iterating the result.
        """
        tags = [
            html.convert_simple_element_to_html_tag(attribute)
            for attribute in self.attributes
        ]
        if not tags:
            tags = [html.HTMLTag("span", {})]
        return tags

    def get_table_info(self):
        """
        Return (table_id, row_id, column_id) gathered from the attributes.

        Each value is None when no attribute provides it.
        Raises NotImplementedError when the attributes reference more than one
        table.
        """
        table_id = row_id = column_id = None
        for box_attribute in self.attributes:
            table = html.get_table_info(box_attribute)
            if not table[0]:
                continue
            if table_id and table[0] != table_id:
                raise NotImplementedError(f"Encountered table id {table[0]} but was expecting {table_id}")

            table_id = table[0]
            if table[1]:
                row_id = table[1]
            if table[2]:
                column_id = table[2]
        return table_id, row_id, column_id

    def get_list_info(self):  # refactor
        """
        Return the list information of the first attribute that has any,
        or (None, None) when no attribute describes a list.
        """
        for box_attribute in self.attributes:
            list_info = html.get_list_info(box_attribute)
            if list_info:
                return list_info
        return None, None

    def styles_to_elements(self):
        """
        Build the element tree for this block of text.

        The styles are nested inside each other: the first style is the
        innermost element (it holds the text) and the last style is the
        outermost element, which is the one returned.
        """
        if self.text.replace("\n", "") == "*":  # maybe change
            self.text = ""

        # LISTS HACK -- After much anguish, I have resorted myself to the dark arts
        # Please forgive me
        # We are using <li/> to represent list items
        if any(style.tag == "li" for style in self.styles):
            span = ET.Element("span")
            list_level = self.get_list_info()[1]
            # A missing level is treated as the top level instead of crashing.
            indent_level = list_level - 1 if list_level else 0
            span.text = indent_level * " " + "* "
            return span

        def _nest(inner, outer):
            outer.append(inner)
            return outer

        individual_elements = [
            ET.Element(style.tag, style.attributes) for style in self.styles
        ]
        lowest_element = individual_elements[0]
        # Folding left-to-right leaves us with the last (outermost) element.
        toplevel_element = reduce(_nest, individual_elements)
        for _ in range(self.num_linebreaks):
            toplevel_element.append(ET.Element("br"))  # Hm
        lowest_element.text = self.text
        return toplevel_element

    def styles_to_markdown_string(self):
        """
        Render this block of text as a markdown string.

        Markdown characters in the text are escaped, every line is decorated
        with the markdown of each attribute and the line breaks are appended
        at the very end.
        """
        escaped_text = (self.text or "").translate(_MARKDOWN_ESCAPES)
        rendered_lines = []
        for line in escaped_text.split("\n"):
            _prefix = ""

            for box_attribute in self.attributes:
                # Empty lines are only decorated by attributes that produce
                # output on their own.
                if line or box_attribute[0] in ["list", "image", "link"]:
                    start, end, prefix = markdown.convert_simple_element_to_markdown(
                        box_attribute
                    )
                    if prefix:
                        # Hacky solution to ensure headers always appear at the start of the line
                        _prefix += start
                        line = line + end
                    else:
                        line = start + line + end

            rendered_lines.append(_prefix + line)

        return "".join(rendered_lines) + "\n" * self.num_linebreaks

    def __repr__(self):
        return json.dumps(
            {k: v for k, v in self.__dict__.items() if k != "element_tree"}, indent=2
        )


class BoxNote:
    NOTE_MAPPING = []  # MAPPING FROM ATTRIB TO HTML TAG

    def __init__(self, note_string):  # TODO: rename notefile to notefilepath
        """
        note_string: the note data as a string
        text is the raw text of the notes document.
        attributes is the attribute formatting string
        attribute pool is all the attributes that are used and a conversion from
        numattribute number to some html-like formatting
        """
        self.note_data = json.loads(note_string)
        self.text = self.note_data["atext"]["text"]
        self.attribute_chunks = self._attribute_chunks_from_string(
            self.note_data["atext"]["attribs"]
        )
        self.attribute_pool = self.note_data["pool"]["numToAttrib"]
        # config?

    @classmethod
    def from_file(cls, filepath):
        """
        Create a BoxNote from the utf8 encoded file at filepath.
        """
        with open(filepath, encoding="utf8") as f:
            return cls(note_string=f.read())

    def get_metadata(self):
        """
        returns potentially useful metadata about the file. ignores more obscure
        metadata that is mostly for internal use. WIP, currently unused
        """
        metadata = {"last_edit_timestamp": self.note_data.get("lastEditTimestamp")}
        return metadata

    @staticmethod
    def _attribute_chunks_from_string(attributes_string):
        """
        Split the attribute string into a list of AttributeChunk.

        A list (not a lazy iterator) is returned on purpose: the chunks are
        stored on the instance and walked once per rendering, so they have to
        survive more than one iteration.
        """
        return [
            AttributeChunk(chunk)
            for chunk in re.findall(r"\*.*?\+[^\*]*", attributes_string)
        ]

    def _get_formatted_text_list(self):
        """
        Walk the attribute chunks and return one FormattedText per chunk,
        each holding the slice of the note text the chunk applies to.
        """
        text = self.text
        output = []
        pointer = 0
        for chunk in self.attribute_chunks:
            attributes = [
                self.attribute_pool[str(attribute_number)]
                for attribute_number in chunk.attributes
            ]
            element_text = text[pointer : pointer + chunk.num_characters]
            blob = FormattedText(
                attributes, element_text, chunk.num_linebreaks, tagnums=chunk.attributes
            )
            output.append(blob)
            pointer += chunk.num_characters
        return output

    def as_element_tree(self):
        """
        Return this note as a <body> element containing every block of text.
        """
        body = ET.Element("body")
        for blob in self._get_formatted_text_list():
            body.append(blob.element_tree)
        return body

    def as_html(self):
        """
        Return this note as a html document with style.css inlined.
        """
        output = '<!DOCTYPE html><html><head><meta charset="utf-8"/>'
        with open(os.path.join(dir_path, "style.css"), encoding="utf8") as f:
            output += "<style>" + f.read() + "</style></head>"
        # NOTE(review): ElementTree escapes "&" as "&amp;" when serialising, so
        # the literal "&nbsp;" should never occur in its output -- confirm the
        # operands of this replace are the intended ones.
        output += ET.tostring(self.as_element_tree(), encoding="unicode").replace(
            "&nbsp;", " "
        )
        return output + "</html>"

    def as_markdown(self):
        """
        Return this note as markdown.

        ## Notes about tables

        1. A new row starts with `struct-table[hash]_col[hash]` and then `struct-table[hash]_row[hash]`.
        2. The continuation of a row can be identified by `struct-table[hash]_row[hash]` and then `struct-table[hash]_col[hash]`
        3. The FormattedText that appears directly before a `struct-table[hash]_col|row[hash]` contains the content for
           the cell.
        4. There doesn't appear to be an indication of a header row
        5. There can be multiple blobs of data before the `struct-table[hash]_col|row[hash]` and therefore you need to
           capture this data so that it can be inserted into a table cell.
        6. BUT... there doesn't seem to be any indication that a table has finished. In some cases the last table cell
           will contain a \n\n but not always.
           Thus, this is why in this method we add the data to the stack, but then if we detect there is a table cell
           that hasn't been filled, we fill it with any data since the previously encountered table cell.
        """

        #: A dict of data that makes up the box note.
        #
        # The key will either be;
        #   1. An integer derived from the index of blobs; or
        #   2. A string that references a table_id derived from the Box Note attribute
        #      `struct-table[table-id]_col|row[hash]`
        out: typing.Dict[typing.Union[int, str], typing.Union[str, Table]] = {}

        #: A list of blob indexes that are captured so they can be placed within a table cell upon discovery.
        captures: typing.List[int] = []

        blobs = self._get_formatted_text_list()

        for i, blob in enumerate(blobs):
            if blob.table_id:
                # This blob contains reference to a table
                #
                # Some previously captured data forms the data that will be placed within this particular table cell.

                if blob.table_id not in out:
                    # This is the first time we've come across this table, so create an instance of Table in which
                    # we can start to place data in.
                    out[blob.table_id] = Table()

                # Combine any text previously captured together. The captured entries are removed from the
                # document because they now live inside the table.
                data = ''.join([
                    out.pop(capture)
                    for capture in captures
                ])

                if data:
                    # Add the previously captured data to the table
                    #
                    # Table relies on a dictionary (which now honours the insertion order) to ensure that this data
                    # will be in the correct row/column and that that row/column is rendered in the correct place
                    # on output.
                    out[blob.table_id].add_data(blob.row_id, blob.column_id, data)

                captures = []

            else:
                if blob.num_linebreaks == 0:
                    # Capture a reference to this data in case it needs to be placed inside a table cell
                    captures.append(i)
                else:
                    # We reset the capture when there is a line break.
                    captures = []

                out[i] = blob.styles_to_markdown_string()

        doc = ''.join([
            o.render_markdown() if hasattr(o, "render_markdown") else o
            for o in out.values()
        ])

        # TODO: Better support for cleaning up the doc
        cleanup = (
            # Box Notes can give you text in a table cell like `**H****ello**` - this is invalid Markdown and we want
            # to convert it to `**Hello**`.
            ('****', ''),
        )

        for search, replace in cleanup:
            doc = doc.replace(search, replace)

        return doc

    def as_text(self):
        """
        Return the raw text of the note without any formatting.
        """
        return self.text

    def __str__(self):
        return json.dumps(self.note_data, indent=2)