boxnotes2html

Convert Box's proprietary Box Notes to HTML, Markdown, or plain text
git clone git://git.alexwennerberg.com/boxnotes2html
Log | Files | Refs | README | LICENSE

boxnote.py (12281B) - raw


      1 """
      2 The primary module for processing and parsing Box Notes
      3 """
      4 import json
      5 import os
      6 import re
      7 from functools import reduce
      8 from xml.etree import ElementTree as ET
      9 
     10 import typing
     11 
     12 from . import html, markdown
     13 from .table import Table
     14 
     15 dir_path = os.path.dirname(os.path.realpath(__file__))
     16 
     17 
     18 class AttributeChunk:  # not really a class
     19     """
     20     An attribute chunk is formatted like this:
     21     *n[*n...]+n[|n+n]
     22     eg
     23     *4*1+1|+1
     24     where *n refers to an attribute to apply from the attribute pool
     25     and +n is a number of characters to apply that attribute to
     26     and |n is indicative of a line break (unclear the purpose of this)
     27     """
     28 
     29     def __init__(self, attribute_string_chunk, position=None):
     30         self.attribute_string_chunk = attribute_string_chunk
     31         self.attributes = set(self._all_items_after_indicator("*"))
     32         self.num_characters = sum(self._all_items_after_indicator("+"))
     33         self.num_linebreaks = sum(self._all_items_after_indicator("|"))
     34 
     35     def _all_items_after_indicator(self, indicator):
     36         """
     37         Regex to get all the numbers after the given indicator
     38         Then convert this base36 number into an integer
     39         * -> attribute
     40         + -> number of characters to apply attribute to
     41         | -> number of linebreaks (I think)
     42         """
     43         items = re.findall(
     44             "\\{}([^\\+\\|\\*]*)".format(indicator), self.attribute_string_chunk
     45         )
     46         return map(lambda x: int(x, 36), items)
     47 
     48 
     49 class FormattedText:
     50     """
     51     A block of text with parsed information about it
     52     """
     53 
     54     def __init__(self, attributes, text, num_linebreaks, tagnums=None):
     55         self.attributes = attributes
     56         self.tagnums = list(tagnums)  # for debugging
     57         self.styles = self.get_base_styles()
     58         self.num_linebreaks = num_linebreaks
     59         self.text = text
     60         self.element_tree = self.styles_to_elements()
     61         self.table_id, self.row_id, self.column_id = self.get_table_info()
     62         self.list_type, self.list_level = self.get_list_info()
     63         return
     64 
     65     def get_base_styles(self):
     66         tags = []
     67         for attribute in self.attributes:
     68             tags.append(html.convert_simple_element_to_html_tag(attribute))
     69         if not tags:
     70             tags = html.HTMLTag("span", {})  # hmm
     71         return tags
     72 
     73     def get_table_info(self):
     74         table_id = row_id = column_id = None
     75         for box_attribute in self.attributes:
     76             if html.get_table_info(box_attribute)[0]:
     77                 table = html.get_table_info(box_attribute)
     78                 if table_id and table[0] != table_id:
     79                     raise NotImplementedError(f"Encountered table id {table[0]} but was expecting {table_id}")
     80 
     81                 table_id = table[0]
     82                 if table[1]:
     83                     row_id = table[1]
     84                 if table[2]:
     85                     column_id = table[2]
     86         return table_id, row_id, column_id
     87 
     88     def get_list_info(self):  # refactor
     89         for box_attribute in self.attributes:
     90             if html.get_list_info(box_attribute):
     91                 return html.get_list_info(box_attribute)
     92         return None, None
     93 
     94     def styles_to_elements(self):
     95         if self.text.replace("\n", "") == "*":  # maybe change
     96             self.text = ""
     97 
     98         # LISTS HACK -- After much anguish, I have resorted myself to the dark arts
     99         # Please forgive me
    100         # We are using <li/> to represent list items
    101         if "li" in [a.tag for a in self.styles]:
    102             span = ET.Element("span")
    103             indent_level = self.get_list_info()[1] - 1 or 0
    104             span.text = indent_level * "&nbsp;&nbsp;" + "* "
    105             return span
    106 
    107         def _append(x, y):
    108             y.append(x)
    109             return y
    110 
    111         individual_elements = list(
    112             map(lambda x: ET.Element(x.tag, x.attributes), self.styles)
    113         )
    114         reduce(_append, individual_elements)
    115         lowest_element = individual_elements[0]  # indexerror
    116         toplevel_element = individual_elements[-1]
    117         for _ in range(self.num_linebreaks):
    118             toplevel_element.append(ET.Element("br"))  # Hm
    119         lowest_element.text = self.text
    120         return toplevel_element
    121 
    122     def styles_to_markdown_string(self):
    123         # escape markdown characters
    124         # kind of awkward
    125         characters_to_escape = "\\*[]`"
    126         tmp = self.text or ""
    127         out_text = ""
    128         if tmp is not None:
    129             for character in characters_to_escape:
    130                 tmp = tmp.replace(character, "\\{}".format(character))
    131             for line in tmp.split("\n"):
    132                 _prefix = ""
    133                 
    134                 for box_attribute in self.attributes:
    135                     if line or box_attribute[0] in ["list", "image", "link"]:
    136                         start, end, prefix = markdown.convert_simple_element_to_markdown(
    137                             box_attribute
    138                         )
    139                         if prefix:
    140                             # Hacky solution to ensure headers always appear at the start of the line
    141                             _prefix += start
    142                             line = line + end
    143                         else:
    144                             line = start + line + end
    145                             
    146                 out_text += _prefix + line
    147             out_text += "\n" * (self.num_linebreaks)
    148 
    149         return out_text
    150 
    151     def __repr__(self):
    152         return json.dumps(
    153             {k: v for k, v in self.__dict__.items() if k != "element_tree"}, indent=2
    154         )
    155 
    156 
    157 class BoxNote:
    158     NOTE_MAPPING = []  # MAPPING FROM ATTRIB TO HTML TAG
    159 
    160     def __init__(self, note_string):  # TODO: rename notefile to notefilepath
    161         """
    162         note_string: the note data as a string
    163         text is the raw text of the notes document.
    164         attributes is the attribute formatting string
    165         attribute pool is all the attributes that are used and a conversion from
    166         numattribute number to some html-like formatting
    167         """
    168         self.note_data = json.loads(note_string)
    169         self.text = self.note_data["atext"]["text"]
    170         self.attribute_chunks = self._attribute_chunks_from_string(
    171             self.note_data["atext"]["attribs"]
    172         )
    173         self.attribute_pool = self.note_data["pool"]["numToAttrib"]
    174         # config?
    175 
    176     @classmethod
    177     def from_file(cls, filepath):
    178         with open(filepath, encoding="utf8") as f:
    179             return cls(note_string=f.read())
    180 
    181     def get_metadata(self):
    182         """
    183         returns potentially useful metadata about the file. ignores more obscure
    184         metadata that is mostly for internal user. WIP, currently unused
    185         """
    186         metadata = {"last_edit_timestamp": self.note_data.get("lastEditTimestamp")}
    187         return metadata
    188 
    189     @staticmethod
    190     def _attribute_chunks_from_string(attributes_string):
    191         return map(AttributeChunk, re.findall("\\*.*?\\+[^\\*]*", attributes_string))
    192 
    193     def _get_formatted_text_list(self):
    194         text = self.text
    195         output = []
    196         pointer = 0
    197         for chunk in self.attribute_chunks:
    198             attributes = [
    199                 self.attribute_pool[str(attribute_number)]
    200                 for attribute_number in chunk.attributes
    201             ]
    202             element_text = text[pointer : pointer + chunk.num_characters]
    203             blob = FormattedText(
    204                 attributes, element_text, chunk.num_linebreaks, tagnums=chunk.attributes
    205             )
    206             output.append(blob)
    207             pointer += chunk.num_characters
    208         return output
    209 
    210     def as_element_tree(self):
    211         body = ET.Element("body")
    212         blobs = self._get_formatted_text_list()
    213         for blob in blobs:
    214             body.append(blob.element_tree)
    215         return body
    216 
    217     def as_html(self):
    218         output = '<!DOCTYPE html><html><head><meta charset="utf-8"/>'
    219         with open(os.path.join(dir_path, "style.css")) as f:
    220             output += "<style>" + f.read() + "</style></head>"
    221         output += ET.tostring(self.as_element_tree(), encoding="unicode").replace(
    222             "&amp;nbsp;", "&nbsp;"
    223         )
    224         return output + "</html>"
    225 
    226     def as_markdown(self):
    227         """
    228         Return this note as markdown.
    229         
    230         ## Notes about tables
    231         
    232         1. A new row starts with start with `struct-table[hash]_col[hash]` and then `struct-table[hash]_row[hash]`.
    233         2. The continuation of a row can be identified by `struct-table[hash]_row[hash]` and then `struct-table[hash]_col[hash]`
    234         3. The FormattedText that appears directly before a `struct-table[hash]_col|row[hash]` contains the content for
    235            the cell.
    236         4. There doesn't appear to be an indication of a header row
    237         5. There can be multiple blobs of data before the `struct-table[hash]_col|row[hash]` and therefore you need to
    238            capture this data so that it can be inserted into a table cell.
    239         6. BUT... there doesn't seem to be any indication that a table has finished. In some cases the last table cell
    240            will contain a \n\n but not always.
    241            Thus, this is why in this method we add the data to the stack, but then if we detect there is a table cell
    242            that hasn't been filled, we fill it with any data since the previously encountered table cell.
    243         """
    244         
    245         #: A dict of data that makes up the box note.
    246         #
    247         # The key will either be;
    248         #     1. An integer derived from the index of blobs; or
    249         #     2. A string that references a table_id derived from the Box Note attribute
    250         #        `struct-table[table-id]_col|row[hash]`
    251         out: typing.Dict[typing.Union[int, str], typing.Union[str, Table]] = {}
    252         
    253         #: A list of blob indexes that are captured so they can be placed within a table cell upon discovery.
    254         captures: typing.List[int] = []
    255         
    256         blobs = self._get_formatted_text_list()
    257         
    258         for i, blob in enumerate(blobs):
    259             if blob.table_id:
    260                 # This blob contains reference to a table
    261                 #
    262                 # Some previously captured data forms the data that will be placed within this particular table cell.
    263                 
    264                 if blob.table_id not in out:
    265                     # This is the first time we've come across this table, so creat an instance of Table in which
    266                     # we can start to place data in.
    267                     out[blob.table_id] = Table()
    268                 
    269                 # Combine any text previously captured together.
    270                 data = ''.join([
    271                     out.pop(capture)
    272                     for capture in captures
    273                 ])
    274                 
    275                 if len(data) > 0:
    276                     # Add the previously captured data to the table
    277                     #
    278                     # Table relies on a dictionary (which now honours the insertion order) to ensure that this data
    279                     # will be in the correct row/column and that that row/column is rendered in the correct place
    280                     # on output.
    281                     out[blob.table_id].add_data(blob.row_id, blob.column_id, data)
    282 
    283                 captures = []
    284 
    285             else:
    286                 if blob.num_linebreaks == 0:
    287                     # Capture a reference to this data in case it needs to be placed inside a table cell
    288                     captures.append(i)
    289                 else:
    290                     # We reset the capture when there is a line break.
    291                     captures = []
    292 
    293                 out[i] = blob.styles_to_markdown_string()
    294 
    295         doc = ''.join([
    296             o.render_markdown() if hasattr(o, "render_markdown") else o
    297             for o in out.values()
    298         ])
    299         
    300         # TODO: Better support for cleaning up the doc
    301         cleanup = (
    302             # Box Notes can give you text in a table cell like `**H****ello**` - this is invalid Markdown and we want
    303             # to convert it to `**Hello**`.
    304             ('****', ''),
    305         )
    306         
    307         for search, replace in cleanup:
    308             doc = doc.replace(search, replace)
    309             
    310         return doc
    311 
    312     def as_text(self):
    313         return self.text
    314 
    315     def __str__(self):
    316         return json.dumps(self.note_data, indent=2)