"""Parse an HTML page with Beautiful Soup into the project's ``Data`` model.

The model classes used below (``Data``, ``Paragraph``, ``TextContent``,
``NewLineContent``, ``Image``, ``Table``) are not defined in this file;
presumably they come from the star import of ``data``.
"""

from bs4 import BeautifulSoup

from data import *

# ``basestring`` only exists on Python 2; fall back to ``str`` on Python 3 so
# the module works on both interpreters.
try:
    _STRING_TYPES = basestring
except NameError:
    _STRING_TYPES = str

# Table section wrappers whose rows must be parsed like direct table children.
_TABLE_SECTIONS = ("thead", "tbody", "tfoot")

# First-child tag names for which a cell is treated as text (None = bare text).
_TEXT_CELL_STARTERS = (None, "a", "b", "i")


def iterable(a):
    """Return True if ``a`` can be iterated over, False otherwise."""
    try:
        iter(a)
    except TypeError:
        return False
    return True


def iterable_and_not_string(a):
    """Return True if ``a`` is iterable but is not a string."""
    return iterable(a) and not isinstance(a, _STRING_TYPES)


class Parser(object):
    """Read an HTML file and fill a ``Data`` object with its content."""

    def __init__(self, file):
        """Remember the path of the HTML file; nothing is read yet.

        :param file: path of the HTML file to parse.
        """
        self._file = file
        self._soup = None
        self._data = Data()

    def init_parse(self):
        """Read the file and build the Beautiful Soup tree."""
        with open(self._file) as f:
            html = f.read()
        self._soup = BeautifulSoup(html, 'html.parser')

    def parse_prev_next(self):
        """Store the ``href`` of the elements with id "prev" and "next".

        A link that is absent from the document leaves the corresponding
        ``Data`` attribute untouched.
        """
        prev = self._soup.find(id="prev")
        if prev:
            self._data._prev = prev["href"]
        next_page = self._soup.find(id="next")
        if next_page:
            self._data._next = next_page["href"]

    def parse_title(self):
        """Store the text of the first ``<h1>`` as the title.

        A document without ``<h1>`` leaves the title untouched instead of
        raising a TypeError.
        """
        title = self._soup.h1
        if title is None:
            return
        self._data._title = self.get_string_from_content(title)

    def get_string_from_content(self, content):
        """Return the concatenated text of every node below ``content``.

        :param content: a tag (or any iterable of nodes); ``None`` gives "".
        :return: the text of the nodes, in document order.
        """
        if content is None:
            return ""
        parts = []
        for node in content:
            if not node.name:
                # Text node: its ``string`` is the text itself.
                parts.append(node.string)
            else:
                parts.append(self.get_string_from_content(node))
        return "".join(parts)

    def parse_content(self, content, lvl=0):
        """Dispatch a node of the body to the matching parse method.

        Elements whose id is "prev" or "next" are skipped. Text nodes found
        directly at this level are ignored. Any tag other than ``p``, ``ul``,
        ``img`` and ``table`` is only traversed.

        :param content: the node to handle.
        :param lvl: nesting depth of ``content``; it is only passed along and
            has no effect on the result.
        """
        try:
            if content["id"] in ("prev", "next"):
                return
        except (TypeError, KeyError):
            # TypeError: text node (string indexing); KeyError: tag without id.
            pass

        name = content.name
        if name == "p":
            self.parse_paragraph(content)
        elif name == "ul":
            self.parse_list(content)
        elif name == "img":
            self.parse_image(content)
        elif name == "table":
            self.parse_table(content)
        elif name:
            for new_content in content.contents:
                self.parse_content(new_content, lvl + 1)

    @staticmethod
    def _text_from_anchor(anchor):
        """Build a TextContent for an ``<a>`` tag, linked only if it has href."""
        href = anchor.get("href")
        if href is None:
            # Named anchor without target: keep it as plain text.
            return TextContent(anchor.string)
        return TextContent(anchor.string, href)

    def parse_paragraph(self, paragraph_content):
        """Add ``paragraph_content`` to the body as paragraph content.

        A ``<p>`` opens a new Paragraph; ``<br>``, ``<a>`` and text nodes are
        appended to the last element of the body; other tags are traversed.

        :param paragraph_content: node to add; objects without a ``name``
            attribute are ignored.
        """
        if not hasattr(paragraph_content, "name"):
            return
        name = paragraph_content.name
        if name == "p":
            self._data._body.append(Paragraph())

        if name == "br":
            self._data._body[-1]._content.append(NewLineContent())
        elif name == "a":
            self._data._body[-1]._content.append(
                self._text_from_anchor(paragraph_content))
        elif not name:
            self._data._body[-1]._content.append(
                TextContent(paragraph_content.string))
        else:
            for content in paragraph_content.contents:
                self.parse_paragraph(content)

    def parse_list(self, list_content, indent=1):
        """Parse a ``<ul>`` into a Paragraph created with ``indent``.

        :param list_content: the ``<ul>`` tag.
        :param indent: indentation level given to the Paragraph.
        """
        self._data._body.append(Paragraph(indent))
        for content in list_content.contents:
            self.parse_list_item(content, indent)

    def parse_list_item(self, content, indent):
        """Parse one child of a list.

        A nested ``<ul>`` is parsed one level deeper, then a new Paragraph at
        the current level is opened for what follows. When ``content`` is an
        ``<li>``, its last TextContent is flagged with ``_new_line``.

        :param content: a child of the ``<ul>``.
        :param indent: indentation level of the enclosing list.
        """
        if not getattr(content, "name", None):
            # Text node between items (typically whitespace): nothing to add.
            return

        for child in content.contents:
            if getattr(child, "name", None) == "ul":
                self.parse_list(child, indent + 1)
                self._data._body.append(Paragraph(indent))
            else:
                self.parse_paragraph(child)

        if content.name == "li":
            items = self._data._body[-1]._content
            if items and isinstance(items[-1], TextContent):
                items[-1]._new_line = True

    def parse_image(self, image_content):
        """Append the Image built from an ``<img>`` tag to the body."""
        self._data._body.append(self.return_parsed_image(image_content))

    def return_parsed_image(self, image_content):
        """Build an Image from an ``<img>`` tag.

        :param image_content: the ``<img>`` tag.
        :return: ``Image(src, width, height)``; a missing width or height is
            given as -1.
        :raises KeyError: if the tag has no ``src`` attribute.
        """
        src = image_content["src"]
        height = image_content.get("height", -1)
        width = image_content.get("width", -1)
        return Image(src, width, height)

    def parse_body(self):
        """Parse every direct child of ``<body>``; no ``<body>`` is a no-op."""
        body = self._soup.body
        if body is None:
            return
        for content in body:
            self.parse_content(content)

    def parse_table(self, table_content):
        """Append a Table to the body and parse the children of the tag."""
        if table_content.name == "table":
            self._data._body.append(Table())
        for content in table_content.contents:
            self.parse_line(content)

    def parse_line(self, line_content):
        """Parse one child of a table.

        A ``<tr>`` opens a new line in the last Table and its ``<td>``
        children are parsed as cells. ``<thead>``, ``<tbody>`` and ``<tfoot>``
        are traversed so that the rows they contain are not lost.

        :param line_content: a child node of the table.
        """
        name = line_content.name
        if name in _TABLE_SECTIONS:
            for row in line_content.contents:
                self.parse_line(row)
            return

        if name == "tr":
            self._data._body[-1]._lines.append([])

        col_index = 0
        # Text nodes have no ``contents``; they simply yield no cell.
        for content in getattr(line_content, "contents", ()):
            if getattr(content, "name", None) == "td":
                self.parse_cell(content, col_index)
                col_index += 1

    def parse_cell(self, cell_content, column_index):
        """Parse a ``<td>``: record its column width, then its content.

        The ``width`` attribute, when present, replaces the width recorded for
        the column; when absent, -1 is recorded only if the column has no
        width yet. An empty cell is stored as an empty Paragraph so that the
        following cells keep their column.

        :param cell_content: the cell tag; anything but ``<td>`` is ignored.
        :param column_index: zero-based index of the cell in its line.
        """
        if cell_content.name != "td":
            return

        table = self._data._body[-1]
        widths = table._columns_width
        width = cell_content.get("width")
        if width is None:
            if column_index >= len(widths):
                widths.append(-1)
        elif column_index < len(widths):
            widths[column_index] = width
        else:
            widths.append(width)

        children = cell_content.contents
        if not children or children[0].name in _TEXT_CELL_STARTERS:
            # The cell contains text (or nothing at all).
            table._lines[-1].append(Paragraph())
        for inner_cell in children:
            self.parse_inner_cell(inner_cell)

    def parse_inner_cell(self, inner_cell_content):
        """Add a node found inside a cell to the current table line.

        Text and ``<a>`` nodes go to the last element of the line, an
        ``<img>`` becomes a new element of the line, other tags are traversed.

        :param inner_cell_content: a node inside a ``<td>``.
        """
        line = self._data._body[-1]._lines[-1]
        inner_name = inner_cell_content.name
        if not inner_name:
            # Pure text.
            line[-1]._content.append(TextContent(inner_cell_content.string))
        elif inner_name == "a":
            line[-1]._content.append(
                self._text_from_anchor(inner_cell_content))
        elif inner_name == "img":
            line.append(self.return_parsed_image(inner_cell_content))
        else:
            for content in inner_cell_content.contents:
                self.parse_inner_cell(content)

    def parse(self):
        """Run the whole parsing: load, prev/next links, title, body."""
        self.init_parse()
        self.parse_prev_next()
        self.parse_title()
        self.parse_body()

    def get_data(self):
        """Return the ``Data`` object filled by the parser."""
        return self._data


if __name__ == "__main__":
    parser = Parser("tmp.html")
    parser.parse()
    for b in parser._data._body:
        # Parenthesised single argument prints the same on Python 2 and 3.
        print("content ")
        b.show()