first commit

7a62cf88 · Alexis Filipozzi · 7a62cf88 · 7a62cf88 · 7a62cf88 · 7a62cf88
Commit 7a62cf88 authored 9 years ago by Alexis Filipozzi
--- a/.gitignore
+++ b/.gitignore
+*.pyc
+env/
\ No newline at end of file
--- a/README.txt
+++ b/README.txt
+On peut utiliser virtualenv pour créer un environnement python.
+On installe les dépendances avec pip :
+pip install -r requirements.txt
+Pour installer lxml, il faut d'abord installer les paquets suivants :
+- libxml2-dev
+- python2.7-dev
+- libxslt1-dev
\ No newline at end of file
--- a/data.py
+++ b/data.py
+from lxml import etree
+class Data:
+	"""Container for one parsed page: title, previous/next links and body items."""
+	def __init__(self):
+		self._body = []
+		self._next = ""
+		self._prev = ""
+		self._title = ""
+	def _as_protoxml(self, link):
+		# Keep everything before the first ".html" and append the new extension.
+		stem = link.partition(".html")[0]
+		return stem + ".protoxml"
+	def get_prev(self):
+		"""Return the previous-page link with its ".html" tail replaced by ".protoxml"."""
+		return self._as_protoxml(self._prev)
+	def get_next(self):
+		"""Return the next-page link with its ".html" tail replaced by ".protoxml"."""
+		return self._as_protoxml(self._next)
+class Image:
+	"""An image reference with optional dimensions.
+
+	``width`` and ``height`` default to -1, which means "not specified": the
+	corresponding XML attribute is then omitted by ``to_xml``.
+	"""
+	def __init__(self, src, width=-1, height=-1):
+		self._src = src
+		self._height = height
+		self._width = width
+	def show(self):
+		"""Print a one-line description of the image (debug helper)."""
+		print("image src: " + self._src + "(" + str(self._width) + " x " + str(self._height) + ")")
+	def to_xml(self):
+		"""Return an ``<image>`` element carrying the source name and any known size."""
+		img = etree.Element("image")
+		img.attrib["name"] = self._src
+		# lxml only accepts string attribute values, so dimensions supplied as
+		# numbers must be converted before being stored.
+		if self._width != -1:
+			img.attrib["width"] = str(self._width)
+		if self._height != -1:
+			img.attrib["height"] = str(self._height)
+		return img
+	def to_element(self, tag_name="element"):
+		"""Wrap the image in ``tag_name`` together with a centring ``<layout>`` child."""
+		el = etree.Element(tag_name)
+		layout = etree.Element("layout")
+		layout.attrib["secondary_alignment"] = "CENTER"
+		layout.attrib["gap"] = "0"
+		layout.attrib["fit_content_height"] = "true"
+		el.append(layout)
+		el.append(self.to_xml())
+		return el
+class Paragraph:
+	"""A run of paragraph content items, optionally indented as a list level."""
+	def __init__(self, list_ident=0):
+		self._list_indent = list_ident
+		self._content = []
+	def show(self):
+		"""Print the indent level followed by every content item (debug helper)."""
+		rendered = "".join(" " + item.to_str() for item in self._content)
+		print("paragraph (" + str(self._list_indent) + "): " + rendered + "\n")
+	def to_xml(self):
+		"""Return a ``<text>`` element holding the XML form of each content item."""
+		text_node = etree.Element("text")
+		for item in self._content:
+			text_node.append(item.to_xml())
+		return text_node
+	def to_element(self, tag_name="element"):
+		"""Wrap the text in ``tag_name``; indented paragraphs also get a left margin."""
+		wrapper = etree.Element(tag_name)
+		if self._list_indent != 0:
+			# The margin is 10 units per indent level.
+			layout = etree.SubElement(wrapper, "layout")
+			margin = etree.SubElement(layout, "left_margin")
+			margin.text = str(10 * self._list_indent)
+		wrapper.append(self.to_xml())
+		return wrapper
+class ParagraphContent:
+	"""Common base class for the items stored in a paragraph's content list."""
+class TextContent(ParagraphContent):
+	"""A piece of text, optionally a hyperlink and optionally followed by a line break."""
+	def __init__(self, text="", href="", new_line = False):
+		# Callers pass BeautifulSoup's ``.string``, which is None when a tag has
+		# several children; treat a missing value as empty text instead of
+		# failing later in strip() or when assigning the XML attribute.
+		self._text = text if text is not None else ""
+		self._href = href if href is not None else ""
+		self._new_line = new_line
+	def get_string(self, clean = True):
+		"""Return the text, stripped of surrounding whitespace when ``clean`` is true."""
+		text = self._text
+		if clean:
+			text = text.strip()
+		return text
+	def get_href(self, clean = True):
+		"""Return the link target; when ``clean`` is true the part from ".html" on is replaced by ".protoxml"."""
+		href = self._href
+		if clean:
+			href = href.split(".html")[0] + ".protoxml"
+		return href
+	def to_str(self):
+		"""Return a debug rendering: the text, "->target" for links, and a newline."""
+		link = ("->" + self.get_href()) if self._href != "" else ""
+		return self.get_string() + link + "\n"
+	def to_xml(self):
+		"""Return a ``<p>`` element with ``string`` and, when set, ``href`` / ``new_line``."""
+		p = etree.Element("p")
+		p.attrib["string"] = self.get_string()
+		if self._href != "":
+			p.attrib["href"] = self.get_href()
+		if self._new_line:
+			p.attrib["new_line"] = "true"
+		return p
+class NewLineContent(ParagraphContent):
+	"""Paragraph item standing for an explicit line break."""
+	def to_str(self):
+		"""Return the debug rendering of a line break."""
+		return "new line\n"
+	def to_xml(self):
+		"""Return an empty ``<p>`` element flagged with ``new_line="true"``."""
+		return etree.Element("p", new_line="true")
+class Table:
+	"""A table made of lines (rows) of cells plus one width entry per column.
+
+	A negative numeric width means "use the remaining space"; any other value
+	is emitted as a fixed column width.
+	"""
+	def __init__(self):
+		self._lines = []
+		self._columns_width = []
+	def get_column_count(self):
+		"""Return the length of the longest line, or 0 when there are no lines."""
+		return max([len(line) for line in self._lines] or [0])
+	def show(self):
+		"""Print the column widths and every cell (debug helper)."""
+		print("table")
+		print("width " + " ".join(str(x) for x in self._columns_width))
+		for line in self._lines:
+			print("new line")
+			for cell in line:
+				print("new cell")
+				cell.show()
+	def to_xml(self):
+		"""Return the ``<table>`` element: column attributes first, then all cells.
+
+		Raises ValueError when the number of recorded widths differs from the
+		number of columns. An explicit raise is used because ``assert`` is
+		removed when Python runs with -O.
+		"""
+		col_count = self.get_column_count()
+		if len(self._columns_width) != col_count:
+			raise ValueError("table has " + str(col_count) + " column(s) but " + str(len(self._columns_width)) + " width(s)")
+		table = etree.Element("table")
+		table.attrib["gap"] = "2"
+		table.attrib["columnCount"] = str(col_count)
+		# column_count
+		for col_index, width in enumerate(self._columns_width):
+			col_attr = etree.SubElement(table, "columnAttribute")
+			col_attr.attrib["index"] = str(col_index)
+			# Only a numeric width can be "negative". Comparing a string width
+			# with 0 merely happens to be False on Python 2 and raises on
+			# Python 3, so the type is checked explicitly.
+			if isinstance(width, (int, float)) and width < 0:
+				col_attr.attrib["useMaxAvailableSpace"] = "true"
+			else:
+				col_attr.attrib["useMaxAvailableSpace"] = "false"
+				col_attr.attrib["fixedColumnWidth"] = str(width)
+		# cell
+		for line in self._lines:
+			for col_index in range(col_count):
+				if col_index >= len(line):
+					# Pad short lines so every line has col_count cells.
+					etree.SubElement(table, "cell")
+					continue
+				cell = line[col_index]
+				if isinstance(cell, Image):
+					# Images are nested as <cell><element>...</element></cell>.
+					cell_xml = etree.SubElement(table, "cell")
+					cell_xml.append(cell.to_element("element"))
+				else:
+					table.append(cell.to_element("cell"))
+		return table
+	def to_element(self):
+		"""Wrap the table in an ``<element>`` tag."""
+		element = etree.Element("element")
+		element.append(self.to_xml())
+		return element
\ No newline at end of file
--- a/formatter.py
+++ b/formatter.py
+from data import *
+from lxml import etree
+import os
+class Formatter:
+	"""Turn a parsed ``Data`` page into a DOWML ``<package>`` XML document."""
+	def __init__(self, data):
+		self._data = data
+		# The three elements below are created by create_root().
+		self._root = None
+		self._body = None
+		self._extra = None
+	def create_root(self):
+		"""Build the fixed page skeleton and remember the root, body and extra_properties elements."""
+		root = etree.Element("package")
+		root.attrib["name"] = "com.daysofwonder.dowml"
+		message = etree.SubElement(root, "message")
+		message.attrib["class"] = "DOWMLPage"
+		style = etree.SubElement(message, "style")
+		style.attrib["inherits"] = "../guide-style.protoxml"
+		body = etree.SubElement(message, "body")
+		layout = etree.SubElement(body, "layout")
+		layout.attrib["secondary_alignment"] = "FILL"
+		layout.attrib["gap"] = "0"
+		layout.attrib["fit_content_height"] = "true"
+		size = etree.SubElement(layout, "size")
+		size.attrib["relative_width"] = "1"
+		size.attrib["relative_height"] = "1"
+		top_marg = etree.SubElement(layout, "top_margin")
+		top_marg.text = "-10"
+		extra = etree.SubElement(message, "extra_properties")
+		self._root = root
+		self._body = body
+		self._extra = extra
+	def format(self):
+		"""Build the whole document: skeleton, then properties, then body items."""
+		self.create_root()
+		self.format_extra()
+		self.format_body()
+	def _add_property(self, key, value):
+		# Append one <property key=... value=.../> to <extra_properties>.
+		prop = etree.SubElement(self._extra, "property")
+		prop.attrib["key"] = key
+		prop.attrib["value"] = value
+	def format_extra(self):
+		"""Emit the Title property and, when set, PreviousPage / NextPage.
+
+		Raises ValueError when the page has no title. An explicit raise is
+		used because ``assert`` is removed when Python runs with -O.
+		"""
+		if not self._data._title:
+			raise ValueError("page has no title")
+		self._add_property("Title", self._data._title)
+		if self._data._prev:
+			self._add_property("PreviousPage", self._data.get_prev())
+		if self._data._next:
+			self._add_property("NextPage", self._data.get_next())
+	def format_body(self):
+		"""Append the element form of every body item to ``<body>``."""
+		for content in self._data._body:
+			self._body.append(content.to_element())
+	def show(self):
+		"""Return the document serialised as UTF-8 with an XML declaration."""
+		return etree.tostring(self._root, encoding='utf-8', xml_declaration=True, pretty_print=True)
+	def write_to_file(self, filename):
+		"""Write the serialised document to ``filename``."""
+		# show() returns already-encoded bytes, so the file is opened in binary
+		# mode: no newline translation, and no str/bytes mismatch.
+		with open(filename, "wb") as f:
+			f.write(self.show())
+if __name__=="__main__":
+	# Manual smoke test. format() reads the page title from its data object,
+	# so a minimal titled page is supplied (passing None always crashed).
+	sample = Data()
+	sample._title = "Sample page"
+	formatter = Formatter(sample)
+	formatter.format()
+	print(etree.tostring(formatter._root, pretty_print=True))
\ No newline at end of file
--- a/html_parser.py
+++ b/html_parser.py
+from bs4 import BeautifulSoup
+from data import *
+def iterable(a):
+	"""Return True when ``a`` can be iterated over, False otherwise."""
+	try:
+		iter(a)
+	except TypeError:
+		return False
+	return True
+def iterable_and_not_string(a):
+	"""Return True for iterables other than strings."""
+	if isinstance(a, basestring):
+		return False
+	return iterable(a)
+class Parser:
+	"""Parse one HTML page into a ``Data`` object.
+
+	Paragraphs, lists, images and tables found under ``<body>`` are appended
+	to ``Data._body``; the title and the previous/next links are stored
+	separately.
+	"""
+	def __init__(self, file):
+		self._file = file
+		self._soup = None  # set by init_parse()
+		self._data = Data()
+	def init_parse(self):
+		"""Read the file and build the BeautifulSoup tree with the 'html.parser' backend."""
+		with open(self._file) as f:
+			html = f.read()
+		self._soup = BeautifulSoup(html, 'html.parser')
+	def parse_prev_next(self):
+		"""Store the href of the elements whose id is "prev" / "next", when present."""
+		prev = self._soup.find(id="prev")
+		if prev:
+			self._data._prev = prev["href"]
+		next_page = self._soup.find(id="next")
+		if next_page:
+			self._data._next = next_page["href"]
+	def parse_title(self):
+		"""Use the string of the first ``<h1>`` as the page title, when there is one."""
+		title = self._soup.h1
+		if title:
+			self._data._title = title.string
+	def parse_content(self, content, lvl = 0):
+		"""Dispatch a body node to the matching parser, recursing into other tags."""
+		# The prev/next navigation elements are handled by parse_prev_next().
+		# Nodes that cannot be looked up by "id" simply fall through.
+		try:
+			if content["id"] in [ "prev", "next" ]:
+				return
+		except (TypeError, KeyError):
+			pass
+		name = content.name
+		if name == "p":
+			self.parse_paragraph(content)
+		elif name == "ul":
+			self.parse_list(content)
+		elif name == "img":
+			self.parse_image(content)
+		elif name == "table":
+			self.parse_table(content)
+		elif not name:
+			# Nameless node outside any paragraph: ignored.
+			pass
+		else:
+			for new_content in content.contents:
+				self.parse_content(new_content)
+	def parse_paragraph(self, paragraph_content):
+		"""Add the node's text, links and line breaks to the last body item.
+
+		A ``<p>`` node first starts a new ``Paragraph``.
+		"""
+		if not hasattr(paragraph_content, "name"):
+			return
+		name = paragraph_content.name
+		if name in ["p"]:
+			self._data._body.append(Paragraph())
+		if name == "br":
+			self._data._body[-1]._content.append(NewLineContent())
+		elif name == "a":
+			self._data._body[-1]._content.append(TextContent(paragraph_content.string, paragraph_content["href"]))
+		elif not name:
+			self._data._body[-1]._content.append(TextContent(paragraph_content.string))
+		else:
+			for content in paragraph_content.contents:
+				self.parse_paragraph(content)
+	def parse_list(self, list_content, indent=1):
+		"""Start an indented paragraph and feed it every child of the list node."""
+		self._data._body.append(Paragraph(indent))
+		for content in list_content.contents:
+			self.parse_list_item(content, indent)
+	def parse_list_item(self, content, indent):
+		"""Parse the children of one list child; nested ``<ul>`` go one level deeper."""
+		for c in content: # remove li tag
+			if hasattr(c, "name") and c.name == "ul":
+				self.parse_list(c, indent+1)
+				# Resume at the current level after the nested list.
+				self._data._body.append(Paragraph(indent))
+			else:
+				self.parse_paragraph(c)
+				# Flag the most recent text so it is followed by a line break.
+				if self._data._body[-1]._content:
+					last_content = self._data._body[-1]._content[-1]
+					if isinstance(last_content, TextContent):
+						last_content._new_line = True
+	def parse_image(self, image_content):
+		"""Append the image described by an ``<img>`` node to the body."""
+		self._data._body.append(self.return_parsed_image(image_content))
+	def return_parsed_image(self, image_content):
+		"""Build an ``Image`` from an ``<img>`` node; missing dimensions become -1."""
+		src = image_content["src"]
+		height = image_content.get("height", -1)
+		width = image_content.get("width", -1)
+		return Image(src, width, height)
+	def parse_body(self):
+		"""Parse every direct child of ``<body>``."""
+		for content in self._soup.body:
+			self.parse_content(content)
+	def parse_table(self, table_content):
+		"""Append a new ``Table`` for a ``<table>`` node and parse its children as lines."""
+		name = table_content.name
+		if name == "table":
+			self._data._body.append(Table())
+		for content in table_content.contents:
+			self.parse_line(content)
+	def parse_line(self, line_content):
+		"""Add one table line for a ``<tr>`` node and parse each of its ``<td>`` children."""
+		name = line_content.name
+		if name == "tr":
+			self._data._body[-1]._lines.append([])
+			col_index = 0
+			if hasattr(line_content, "contents"):
+				for content in line_content.contents:
+					if (hasattr(content, "name")) and content.name == "td":
+						self.parse_cell(content, col_index)
+						col_index += 1
+	def parse_cell(self, cell_content, column_index):
+		"""Record the column width of a ``<td>`` node and parse what it contains."""
+		name = cell_content.name
+		if name == "td":
+			# we try to access the width of each column
+			widths = self._data._body[-1]._columns_width
+			width = cell_content.get("width")
+			if column_index >= len(widths):
+				# First cell seen for this column: -1 marks "no explicit width".
+				widths.append(width if width is not None else -1)
+			elif width is not None:
+				widths[column_index] = width
+			# then we parse the cell
+			contents = cell_content.contents
+			# An empty <td> is treated as an empty text cell; indexing
+			# contents[0] on it used to raise IndexError and abort the page.
+			if not contents or contents[0].name in [None, "a", "b", "i"]:
+				# the cell contains text
+				self._data._body[-1]._lines[-1].append(Paragraph())
+			for inner_cell in contents:
+				self.parse_inner_cell(inner_cell)
+	def parse_inner_cell(self, inner_cell_content):
+		"""Add a cell's text/link to the line's last entry, or append an image to the line."""
+		inner_name = inner_cell_content.name
+		if not inner_name: # pure text
+			self._data._body[-1]._lines[-1][-1]._content.append(TextContent(inner_cell_content.string))
+		elif inner_name == "a":
+			self._data._body[-1]._lines[-1][-1]._content.append(TextContent(inner_cell_content.string, inner_cell_content["href"]))
+		elif inner_name == "img":
+			self._data._body[-1]._lines[-1].append(self.return_parsed_image(inner_cell_content))
+		else:
+			for content in inner_cell_content.contents:
+				self.parse_inner_cell(content)
+	def parse(self):
+		"""Run the full pipeline: load, navigation links, title, then body."""
+		self.init_parse()
+		self.parse_prev_next()
+		self.parse_title()
+		self.parse_body()
+	def get_data(self):
+		"""Return the ``Data`` object filled by ``parse``."""
+		return self._data
+if __name__=="__main__":
+	# Manual smoke test: parse tmp.html and dump every body item.
+	demo_parser = Parser("tmp.html")
+	demo_parser.parse()
+	for item in demo_parser.get_data()._body:
+		print("content ")
+		item.show()
\ No newline at end of file
--- a/main.py
+++ b/main.py
+from formatter import Formatter
+from html_parser import Parser
+import sys
+from distutils import dir_util
+import os
+def main(argv):
+	"""Convert, in order, every directory named in ``argv``."""
+	for directory in argv:
+		main_one_arg(directory)
+def main_one_arg(directory):
+	"""Convert the content of ``directory`` into a sibling "<directory>-protoxml" tree.
+
+	Raises OSError when ``directory`` does not exist.
+	"""
+	if not os.path.exists(directory):
+		raise OSError("Directory " + str(directory) + " does not exist.")
+	# Normalise first: with a trailing separator ("docs/") plain concatenation
+	# gave "docs/-protoxml", i.e. an output directory inside the source tree.
+	destination = os.path.normpath(directory) + "-protoxml"
+	if not os.path.exists(destination):
+		os.makedirs(destination)
+	for filename in os.listdir(directory):
+		parse_and_format(directory, filename, destination)
+def parse_and_format(dir_path, filename, dest):
+	"""Convert one directory entry: files are parsed, "images" is copied, other directories are recursed into."""
+	path_to_file = os.path.join(dir_path, filename)
+	assert(os.path.exists(path_to_file))
+	path_to_dst = os.path.join(dest, filename)
+	if os.path.isfile(path_to_file):
+		parse_format_and_write_file(path_to_file, path_to_dst)
+		return
+	if not os.path.isdir(path_to_file):
+		print("Unhandled file type (probably symlink) with file " + str(path_to_file))
+		return
+	if filename == "images":
+		# Image directories are copied as they are.
+		dir_util.copy_tree(path_to_file, path_to_dst)
+		return
+	# Any other directory is mirrored in the destination and walked.
+	if not os.path.exists(path_to_dst):
+		os.makedirs(path_to_dst)
+	for sub_file in os.listdir(path_to_file):
+		parse_and_format(path_to_file, sub_file, path_to_dst)
+def parse_format_and_write_file(filename, dest):
+	"""Parse ``filename`` and write its formatted form to ``dest``.
+
+	Failures are reported on standard output and never propagate, so one bad
+	file does not stop the conversion of the others.
+	"""
+	print("parse file")
+	try:
+		parser = Parser(filename)
+		parser.parse()
+	except Exception as e:
+		print("Error with file while parsing: " + str(filename) + " (this file might not be HTML valid enough)")
+		print("Error message: " + str(e))
+		return
+	try:
+		formatter = Formatter(parser.get_data())
+		formatter.format()
+		formatter.write_to_file(dest)
+	except Exception as e:
+		print("Error with file while fomatting: " + str(filename))
+		# Report the cause here too, as the parsing branch does; it used to
+		# be dropped silently.
+		print("Error message: " + str(e))
+if __name__=="__main__":
+	# sys.argv[0] is the command name; only the remaining entries are passed on.
+	directories = sys.argv[1:]
+	main(directories)
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
+BeautifulSoup
+beautifulsoup4 
+lxml
\ No newline at end of file