diff --git a/data.py b/data.py index f600fd716c58403d77662245b55fd48594587781..4055f40350766a6631c61adebaf087809b4c2d52 100644 --- a/data.py +++ b/data.py @@ -89,6 +89,8 @@ class TextContent(ParagraphContent): def get_string(self, clean = True): text = self._text + if not text: + text = "" if clean: text = text.strip() return text diff --git a/html_parser.py b/html_parser.py index 91a1e69e3f4dab1f348e8915e36138ae76485735..22f215651a9bc05d4c93c73ca109d91fb08316e8 100644 --- a/html_parser.py +++ b/html_parser.py @@ -2,11 +2,11 @@ from bs4 import BeautifulSoup from data import * def iterable(a): - try: - (x for x in a) - return True - except TypeError: - return False + try: + (x for x in a) + return True + except TypeError: + return False def iterable_and_not_string(a): return iterable(a) and not isinstance(a, basestring) @@ -88,10 +88,11 @@ class Parser: self._data._body.append(Paragraph(indent)) else: self.parse_paragraph(c) - if self._data._body[-1]._content: - last_content = self._data._body[-1]._content[-1] - if isinstance(last_content, TextContent): - last_content._new_line = True + if content.name == "li": + if self._data._body[-1]._content: + last_content = self._data._body[-1]._content[-1] + if isinstance(last_content, TextContent): + last_content._new_line = True def parse_image(self, image_content): self._data._body.append(self.return_parsed_image(image_content)) diff --git a/main.py b/main.py index 5e65628a83670260365143600c68e962c0f94fd7..bd203ad22cdd85f584fd5263cf2717c33102eb1b 100644 --- a/main.py +++ b/main.py @@ -32,12 +32,11 @@ def parse_and_format(dir_path, filename, dest): for sub_file in os.listdir(path_to_file): parse_and_format(path_to_file, sub_file, path_to_dst) elif os.path.isfile(path_to_file): - parse_format_and_write_file(path_to_file, path_to_dst) + parse_format_and_write_file(path_to_file, os.path.join(dest, filename.replace(".html", ".protoxml"))) else: print "Unhandled file type (probably symlink) with file " + str(path_to_file) def parse_format_and_write_file(filename, dest): - print "parse file" has_been_parsed = False parser = None formatter = None