Skip to content
Snippets Groups Projects
Commit 7a62cf88 authored by Alexis Filipozzi's avatar Alexis Filipozzi
Browse files

first commit

parents
No related branches found
No related tags found
No related merge requests found
*.pyc
env/
\ No newline at end of file
On peut utiliser virtualenv pour créer un environnement python.
On installe les dépendances avec pip :
pip install -r requirements.txt
Pour installer lxml, il faut d'abord installer les paquets système suivants :
- libxml2-dev
- python2.7-dev
- libxslt1-dev
\ No newline at end of file
data.py 0 → 100644
from lxml import etree
class Data:
    """Plain container for what was extracted from one HTML page.

    Attributes:
        _title: page title (empty string until set).
        _prev:  href of the previous page, or "" when there is none.
        _next:  href of the next page, or "" when there is none.
        _body:  ordered list of content objects making up the page body.
    """

    def __init__(self):
        self._body = []
        self._next = ""
        self._prev = ""
        self._title = ""

    def get_prev(self):
        """Return the previous-page link with ".html..." replaced by ".protoxml"."""
        stem = self._prev.partition(".html")[0]
        return stem + ".protoxml"

    def get_next(self):
        """Return the next-page link with ".html..." replaced by ".protoxml"."""
        stem = self._next.partition(".html")[0]
        return stem + ".protoxml"
class Image:
    """An image reference, optionally with explicit dimensions.

    Args:
        src:    image path, emitted as the ``name`` attribute.
        width:  width value, or -1 when unspecified.
        height: height value, or -1 when unspecified.
    """

    def __init__(self, src, width=-1, height=-1):
        self._src = src
        self._height = height
        self._width = width

    def show(self):
        """Print a one-line debug description of the image."""
        print("image src: " + self._src + "(" + str(self._width) + " x " + str(self._height) + ")")

    def to_xml(self):
        """Return an ``<image>`` element describing this image.

        ``width`` / ``height`` attributes are only emitted when they differ
        from the -1 "unspecified" sentinel.
        """
        img = etree.Element("image")
        img.attrib["name"] = self._src
        # XML attribute values must be text: dimensions may be given as
        # numbers (not only as strings read from HTML), so convert them.
        if self._width != -1:
            img.attrib["width"] = str(self._width)
        if self._height != -1:
            img.attrib["height"] = str(self._height)
        return img

    def to_element(self, tag_name="element"):
        """Wrap the image in a ``tag_name`` element preceded by a centering layout."""
        el = etree.Element(tag_name)
        layout = etree.Element("layout")
        layout.attrib["secondary_alignment"] = "CENTER"
        layout.attrib["gap"] = "0"
        layout.attrib["fit_content_height"] = "true"
        el.append(layout)
        el.append(self.to_xml())
        return el
class Paragraph:
    """A run of inline content, optionally indented as a list level.

    Args:
        list_ident: list nesting depth; 0 means "not inside a list".
    """

    def __init__(self, list_ident=0):
        self._list_indent = list_ident
        self._content = []

    def show(self):
        """Print a debug dump of the paragraph and its inline pieces."""
        pieces = [" " + item.to_str() for item in self._content]
        print("paragraph (" + str(self._list_indent) + "): " + "".join(pieces) + "\n")

    def to_xml(self):
        """Return a ``<text>`` element holding the XML of every inline piece."""
        text_node = etree.Element("text")
        for item in self._content:
            text_node.append(item.to_xml())
        return text_node

    def to_element(self, tag_name="element"):
        """Wrap the paragraph in ``tag_name``; list levels get a left margin.

        The margin is 10 units per nesting level and is only emitted when the
        nesting depth is non-zero.
        """
        wrapper = etree.Element(tag_name)
        if self._list_indent != 0:
            margin = etree.Element("left_margin")
            margin.text = str(10 * self._list_indent)
            layout = etree.SubElement(wrapper, "layout")
            layout.append(margin)
        wrapper.append(self.to_xml())
        return wrapper
class ParagraphContent:
    """Marker base class for the inline pieces stored in a paragraph."""
    pass


class TextContent(ParagraphContent):
    """A piece of text, optionally a hyperlink, optionally ending a line.

    Args:
        text:     the displayed text; ``None`` is treated as "".
        href:     link target, "" (or ``None``) when the text is not a link.
        new_line: whether a line break follows this piece.
    """

    def __init__(self, text="", href="", new_line = False):
        # HTML parsers report "no single string" as None (e.g. a tag with
        # nested markup); normalise so strip()/split() and XML attribute
        # assignment never see None.
        self._text = text if text is not None else ""
        self._href = href if href is not None else ""
        self._new_line = new_line

    def get_string(self, clean = True):
        """Return the text, stripped of surrounding whitespace when ``clean``."""
        text = self._text
        if clean:
            text = text.strip()
        return text

    def get_href(self, clean = True):
        """Return the link target.

        When ``clean`` is true everything from the first ".html" onwards is
        replaced by ".protoxml" (the suffix is appended even if ".html" does
        not occur in the link).
        """
        href = self._href
        if clean:
            href = href.split(".html")[0] + ".protoxml"
        return href

    def to_str(self):
        """Return a debug line: the text, then "->target" for links."""
        return self.get_string() + (( "->" + self.get_href()) if self._href != "" else "") + "\n"

    def to_xml(self):
        """Return a ``<p>`` element with ``string`` and optional ``href`` / ``new_line``."""
        p = etree.Element("p")
        p.attrib["string"] = self.get_string()
        if self._href != "":
            p.attrib["href"] = self.get_href()
        if self._new_line:
            p.attrib["new_line"] = "true"
        return p
class NewLineContent(ParagraphContent):
    """Inline piece that carries no text and only forces a line break."""

    def to_str(self):
        """Return the debug representation of the line break."""
        return "new line\n"

    def to_xml(self):
        """Return an empty ``<p>`` element flagged with ``new_line="true"``."""
        line_break = etree.Element("p")
        line_break.set("new_line", "true")
        return line_break
class Table:
    """A table: rows of cells plus one width entry per column.

    Attributes:
        _lines:         list of rows, each row being a list of cell objects.
        _columns_width: one entry per column; a negative value (the parser
                        uses -1) means "take the remaining space", anything
                        else is emitted as a fixed column width.
    """

    def __init__(self):
        self._lines = []
        self._columns_width = []

    def get_column_count(self):
        """Return the length of the longest row (0 for an empty table)."""
        return max([len(line) for line in self._lines] or [0])

    @staticmethod
    def _is_flexible_width(width):
        """Tell whether ``width`` means "use the remaining space".

        Widths read from HTML are strings while the "unspecified" sentinel is
        the integer -1, so the value is compared numerically instead of
        relying on mixed str/int ordering. Values that are not plain
        integers (e.g. "50%") are kept as fixed widths.
        """
        if width is None:
            return True
        try:
            return int(width) < 0
        except (TypeError, ValueError):
            return False

    def show(self):
        """Print a debug dump of the widths and of every cell."""
        print("table")
        print("width " + " ".join(str(x) for x in self._columns_width))
        for line in self._lines:
            print("new line")
            for cell in line:
                print("new cell")
                cell.show()

    def to_xml(self):
        """Return the ``<table>`` element: column attributes, then all cells.

        Rows shorter than the widest row are padded with empty ``<cell>``
        elements. Image cells are wrapped in a ``<cell>`` holding the image
        element; other cells render themselves directly as ``<cell>``.
        """
        col_count = self.get_column_count()
        table = etree.Element("table")
        table.attrib["gap"] = "2"
        table.attrib["columnCount"] = str(col_count)
        assert len(self._columns_width) == col_count, (
            "table has %d column width(s) for %d column(s)"
            % (len(self._columns_width), col_count))
        # column_count
        for col_index, width in enumerate(self._columns_width):
            col_attr = etree.Element("columnAttribute")
            col_attr.attrib["index"] = str(col_index)
            if self._is_flexible_width(width):
                col_attr.attrib["useMaxAvailableSpace"] = "true"
            else:
                col_attr.attrib["useMaxAvailableSpace"] = "false"
                col_attr.attrib["fixedColumnWidth"] = str(width)
            table.append(col_attr)
        # cell
        for line in self._lines:
            for col_index in range(col_count):
                if len(line) <= col_index:
                    # pad short rows so every row has col_count cells
                    table.append(etree.Element("cell"))
                    continue
                cell = line[col_index]
                if isinstance(cell, Image):
                    cell_xml = etree.Element("cell")
                    table.append(cell_xml)
                    cell_xml.append(cell.to_element("element"))
                else:
                    table.append(cell.to_element("cell"))
        return table

    def to_element(self):
        """Return the table wrapped in an ``<element>`` node."""
        element = etree.Element("element")
        element.append(self.to_xml())
        return element
\ No newline at end of file
from data import *
from lxml import etree
import os
class Formatter:
    """Build the DOWML XML tree for one parsed page and serialise it.

    Args:
        data: the parsed page; must expose ``_title``, ``_prev``, ``_next``,
              ``_body``, ``get_prev()`` and ``get_next()``.
    """

    def __init__(self, data):
        self._data = data
        self._root = None
        self._body = None
        self._extra = None

    @staticmethod
    def _child(parent, tag, attributes=()):
        """Append a new ``tag`` element to ``parent`` and return it.

        ``attributes`` is a sequence of (key, value) pairs, set in order so
        the serialised attribute order is deterministic.
        """
        node = etree.SubElement(parent, tag)
        for key, value in attributes:
            node.attrib[key] = value
        return node

    def _add_property(self, key, value):
        """Append one ``<property key=... value=...>`` to the extra properties."""
        self._child(self._extra, "property", [("key", key), ("value", value)])

    def create_root(self):
        """Create the fixed page skeleton and remember root, body and extras."""
        root = etree.Element("package")
        root.attrib["name"] = "com.daysofwonder.dowml"
        message = self._child(root, "message", [("class", "DOWMLPage")])
        self._child(message, "style", [("inherits", "../guide-style.protoxml")])
        body = self._child(message, "body")
        layout = self._child(body, "layout", [
            ("secondary_alignment", "FILL"),
            ("gap", "0"),
            ("fit_content_height", "true"),
        ])
        self._child(layout, "size", [
            ("relative_width", "1"),
            ("relative_height", "1"),
        ])
        top_margin = self._child(layout, "top_margin")
        top_margin.text = "-10"
        extra = self._child(message, "extra_properties")
        self._root = root
        self._body = body
        self._extra = extra

    def format(self):
        """Build the whole tree: skeleton, page properties, then body content."""
        self.create_root()
        self.format_extra()
        self.format_body()

    def format_extra(self):
        """Emit the Title property and, when present, PreviousPage / NextPage.

        Raises:
            AssertionError: if the page has no title.
        """
        assert self._data._title, "page has no title"
        self._add_property("Title", self._data._title)
        if self._data._prev:
            self._add_property("PreviousPage", self._data.get_prev())
        if self._data._next:
            self._add_property("NextPage", self._data.get_next())

    def format_body(self):
        """Append the element of every body content object, in order."""
        for content in self._data._body:
            self._body.append(content.to_element())

    def show(self):
        """Return the tree serialised as pretty-printed UTF-8 XML bytes."""
        return etree.tostring(self._root, encoding='utf-8', xml_declaration=True, pretty_print=True)

    def write_to_file(self, filename):
        """Write the serialised tree to ``filename``.

        ``show()`` returns UTF-8 encoded bytes, so the file is opened in
        binary mode: this avoids newline translation and a str/bytes
        mismatch on interpreters that distinguish the two.
        """
        with open(filename, "wb") as f:
            f.write(self.show())
if __name__ == "__main__":
    # Manual smoke test: format a minimal page and dump the resulting XML.
    # A real Data object with a title is required; format_extra() reads
    # the title and asserts that it is non-empty.
    demo_page = Data()
    demo_page._title = "Demo"
    formatter = Formatter(demo_page)
    formatter.format()
    print(etree.tostring(formatter._root, pretty_print=True))
\ No newline at end of file
from bs4 import BeautifulSoup
from data import *
def iterable(a):
    """Return True when ``a`` supports iteration, False otherwise."""
    try:
        iter(a)
    except TypeError:
        return False
    return True
def iterable_and_not_string(a):
    """Return True for iterables other than strings (Python 2 ``basestring``)."""
    if isinstance(a, basestring):
        return False
    return iterable(a)
class Parser:
    """Parse one HTML page into a ``Data`` object using BeautifulSoup.

    Usage: ``parser = Parser(path); parser.parse(); data = parser.get_data()``.

    Args:
        file: path of the HTML file to read.
    """

    # Leading children that make a table cell start with a text paragraph.
    _TEXT_LIKE_TAGS = (None, "a", "b", "i")
    # Row containers that may sit between <table> and its <tr> rows.
    _ROW_GROUP_TAGS = ("thead", "tbody", "tfoot")

    def __init__(self, file):
        self._file = file
        self._soup = None
        self._data = Data()

    @staticmethod
    def _text_of(node):
        """Return the text of ``node``, never None.

        ``.string`` is None for a tag that has more than one child or nested
        markup; fall back to the concatenated text in that case.
        """
        text = node.string
        if text is None:
            text = node.get_text()
        return text

    def init_parse(self):
        """Read the file and build the BeautifulSoup tree."""
        with open(self._file) as f:
            html = f.read()
        self._soup = BeautifulSoup(html, 'html.parser')

    def parse_prev_next(self):
        """Store the hrefs of the elements with id "prev" and "next", if any."""
        prev = self._soup.find(id="prev")
        if prev:
            # an element without href leaves the link empty instead of failing
            self._data._prev = prev.get("href", "")
        next_page = self._soup.find(id="next")
        if next_page:
            self._data._next = next_page.get("href", "")

    def parse_title(self):
        """Use the text of the first ``<h1>`` as page title, when present."""
        title = self._soup.h1
        if title:
            self._data._title = self._text_of(title)

    def parse_content(self, content, lvl = 0):
        """Dispatch one body node to the matching parse_* method.

        Nodes whose id is "prev" or "next" are skipped (they are handled by
        parse_prev_next), bare text at this level is ignored, and any other
        tag is searched recursively. ``lvl`` is accepted but unused.
        """
        try:
            content_id = content["id"]
            if content_id in [ "prev", "next" ]:
                return
        except (TypeError, KeyError):
            # text nodes cannot be indexed by name; tags may have no id
            pass
        name = content.name
        if name == "p":
            self.parse_paragraph(content)
        elif name == "ul":
            self.parse_list(content)
        elif name == "img":
            self.parse_image(content)
        elif name == "table":
            self.parse_table(content)
        elif not name:
            pass
        else:
            for new_content in content.contents:
                self.parse_content(new_content)

    def parse_paragraph(self, paragraph_content):
        """Append the inline content of a node to the current paragraph.

        A ``<p>`` first starts a new paragraph. ``<br>`` becomes a line
        break, ``<a>`` a link, a text node plain text; any other tag is
        descended into.
        """
        if not hasattr(paragraph_content, "name"):
            return
        name = paragraph_content.name
        if name in ["p"]:
            self._data._body.append(Paragraph())
        if name == "br":
            self._data._body[-1]._content.append(NewLineContent())
        elif name == "a":
            self._data._body[-1]._content.append(
                TextContent(self._text_of(paragraph_content),
                            paragraph_content.get("href", "")))
        elif not name:
            self._data._body[-1]._content.append(
                TextContent(self._text_of(paragraph_content)))
        else:
            for content in paragraph_content.contents:
                self.parse_paragraph(content)

    def parse_list(self, list_content, indent=1):
        """Start an indented paragraph and parse every child of a ``<ul>``."""
        self._data._body.append(Paragraph(indent))
        for content in list_content.contents:
            self.parse_list_item(content, indent)

    def parse_list_item(self, content, indent):
        """Parse one child of a ``<ul>`` (normally an ``<li>``).

        Nested lists are parsed one level deeper, after which a fresh
        paragraph at the current level is started. Finally the last text
        piece of the current paragraph is flagged to end its line.
        """
        for c in content: # remove li tag
            if hasattr(c, "name") and c.name == "ul":
                self.parse_list(c, indent+1)
                self._data._body.append(Paragraph(indent))
            else:
                # pieces without a ``name`` attribute are ignored by
                # parse_paragraph
                self.parse_paragraph(c)
        if self._data._body[-1]._content:
            last_content = self._data._body[-1]._content[-1]
            if isinstance(last_content, TextContent):
                last_content._new_line = True

    def parse_image(self, image_content):
        """Append an ``<img>`` node to the body as an Image."""
        self._data._body.append(self.return_parsed_image(image_content))

    def return_parsed_image(self, image_content):
        """Build an Image from an ``<img>`` node; missing sizes become -1."""
        src = image_content["src"]
        height = image_content.get("height", -1)
        width = image_content.get("width", -1)
        return Image(src, width, height)

    def parse_body(self):
        """Parse every direct child of ``<body>``.

        Raises:
            ValueError: if the document has no ``<body>`` element.
        """
        body = self._soup.body
        if body is None:
            raise ValueError("no <body> element found in " + str(self._file))
        for content in body:
            self.parse_content(content)

    def parse_table(self, table_content):
        """Append a Table to the body and parse the children of ``<table>``."""
        name = table_content.name
        if name == "table":
            self._data._body.append(Table())
        for content in table_content.contents:
            self.parse_line(content)

    def parse_line(self, line_content):
        """Parse one child of a table: a ``<tr>`` row or a row group.

        ``<thead>`` / ``<tbody>`` / ``<tfoot>`` are descended into so that
        their rows are not lost. Each ``<tr>`` opens a new row and its
        ``<td>`` children are parsed as consecutive columns.
        """
        name = getattr(line_content, "name", None)
        if name in self._ROW_GROUP_TAGS:
            for row in line_content.contents:
                self.parse_line(row)
            return
        if name == "tr":
            self._data._body[-1]._lines.append([])
        col_index = 0
        if hasattr(line_content, "contents"):
            for content in line_content.contents:
                if getattr(content, "name", None) == "td":
                    self.parse_cell(content, col_index)
                    col_index += 1

    def parse_cell(self, cell_content, column_index):
        """Record the column width of a ``<td>`` and parse its content.

        The last ``width`` attribute seen for a column wins; a column with
        no width so far is recorded as -1.
        """
        if cell_content.name != "td":
            return
        table = self._data._body[-1]
        widths = table._columns_width
        # we try to access the width of each column
        width = cell_content.get("width")
        if width is not None:
            if column_index < len(widths):
                widths[column_index] = width
            else:
                widths.append(width)
        elif column_index >= len(widths):
            widths.append(-1)
        # then we parse the cell
        contents = cell_content.contents
        first_name = getattr(contents[0], "name", None) if contents else None
        # Open a text paragraph when the cell starts with text, and also
        # when it holds no image at all (empty cell, or text wrapped in
        # other tags): every such cell then owns a paragraph to receive
        # its text and still occupies its column.
        if first_name in self._TEXT_LIKE_TAGS or cell_content.find("img") is None:
            table._lines[-1].append(Paragraph())
        for inner_cell in contents:
            self.parse_inner_cell(inner_cell)

    def parse_inner_cell(self, inner_cell_content):
        """Add one node found inside a table cell to the current row.

        Text and links go to the last cell of the row, an ``<img>`` becomes
        a cell of its own, other tags are descended into.
        """
        inner_name = getattr(inner_cell_content, "name", None)
        row = self._data._body[-1]._lines[-1]
        if not inner_name: # pure text
            row[-1]._content.append(
                TextContent(self._text_of(inner_cell_content)))
        elif inner_name == "a":
            row[-1]._content.append(
                TextContent(self._text_of(inner_cell_content),
                            inner_cell_content.get("href", "")))
        elif inner_name == "img":
            row.append(self.return_parsed_image(inner_cell_content))
        else:
            for content in inner_cell_content.contents:
                self.parse_inner_cell(content)

    def parse(self):
        """Run the full pipeline: load, navigation links, title, body."""
        self.init_parse()
        self.parse_prev_next()
        self.parse_title()
        self.parse_body()

    def get_data(self):
        """Return the Data object filled by parse()."""
        return self._data
if __name__ == "__main__":
    # Manual smoke test: parse tmp.html and dump every body element.
    parser = Parser("tmp.html")
    parser.parse()
    for element in parser._data._body:
        print("content ")
        element.show()
\ No newline at end of file
main.py 0 → 100644
from formatter import Formatter
from html_parser import Parser
import sys
from distutils import dir_util
import os
def main(argv):
    """Convert each directory listed in ``argv``, one after the other."""
    for directory in argv:
        main_one_arg(directory)
def main_one_arg(directory):
    """Convert one directory of HTML pages into "<directory>-protoxml".

    The destination directory is created next to the source directory when
    it does not exist yet.

    Args:
        directory: path of the source directory.

    Raises:
        OSError: if ``directory`` does not exist or is not a directory.
    """
    if not os.path.exists(directory):
        raise OSError("Directory " + str(directory) + " does not exist.")
    if not os.path.isdir(directory):
        raise OSError(str(directory) + " is not a directory.")
    # Normalise first: with a trailing separator ("docs/") the suffix would
    # otherwise give "docs/-protoxml", i.e. an output directory inside the
    # source tree that the walk below would then try to convert as well.
    destination = os.path.normpath(directory) + "-protoxml"
    if not os.path.exists(destination):
        os.makedirs(destination)
    for filename in os.listdir(directory):
        parse_and_format(directory, filename, destination)
def parse_and_format(dir_path, filename, dest):
    """Convert one directory entry, recursing into sub-directories.

    * a directory named "images" is copied as is;
    * any other directory is recreated under ``dest`` and walked;
    * a regular file is parsed and written; "page.html" is written as
      "page.protoxml" so the file name matches the ".protoxml" targets of
      the links generated inside the pages;
    * anything else (e.g. a broken symlink) is reported and skipped.

    Args:
        dir_path: directory containing the entry.
        filename: name of the entry inside ``dir_path``.
        dest:     destination directory matching ``dir_path``.
    """
    path_to_file = os.path.join(dir_path, filename)
    path_to_dst = os.path.join(dest, filename)
    if os.path.isdir(path_to_file):
        if filename == "images":
            dir_util.copy_tree(path_to_file, path_to_dst)
        else:
            if not os.path.exists(path_to_dst):
                os.makedirs(path_to_dst)
            for sub_file in os.listdir(path_to_file):
                parse_and_format(path_to_file, sub_file, path_to_dst)
    elif os.path.isfile(path_to_file):
        stem, extension = os.path.splitext(path_to_dst)
        if extension == ".html":
            path_to_dst = stem + ".protoxml"
        parse_format_and_write_file(path_to_file, path_to_dst)
    else:
        # No existence assertion up front: a dangling symlink does not
        # "exist" and must end up here instead of aborting the whole run.
        print("Unhandled file type (probably symlink) with file " + str(path_to_file))
def parse_format_and_write_file(filename, dest):
    """Parse ``filename`` and write its formatted version to ``dest``.

    Failures are reported on standard output and never propagated, so one
    bad file does not stop the conversion of the others. Nothing is written
    when parsing fails.

    Args:
        filename: path of the HTML file to convert.
        dest:     path of the file to write.
    """
    print("parse file")
    try:
        parser = Parser(filename)
        parser.parse()
    except Exception as e:
        print("Error with file while parsing: " + str(filename) + " (this file might not be HTML valid enough)")
        print("Error message: " + str(e))
        return
    try:
        formatter = Formatter(parser.get_data())
        formatter.format()
        formatter.write_to_file(dest)
    except Exception as e:
        print("Error with file while fomatting: " + str(filename))
        # report the cause, as the parsing failure path does
        print("Error message: " + str(e))
if __name__ == "__main__":
    # sys.argv[0] is the command name: only the remaining items are directories
    directories = sys.argv[1:]
    main(directories)
\ No newline at end of file
BeautifulSoup
beautifulsoup4
lxml
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment