from bs4 import BeautifulSoup
from data import *

def iterable(a):
	"""Return True when ``a`` can be iterated over, False otherwise.

	Strings are iterable and therefore return True here.
	"""
	try:
		# Ask for an iterator directly instead of building a generator
		# expression that is thrown away immediately.
		iter(a)
	except TypeError:
		return False
	return True

def iterable_and_not_string(a):
	"""Return True when ``a`` is iterable but is not a string.

	On Python 2 "string" means any ``basestring`` (``str`` or ``unicode``).
	Python 3 has no ``basestring``, so ``str`` and ``bytes`` are used there,
	which keeps text and byte strings excluded on both versions.
	"""
	try:
		string_types = basestring
	except NameError:
		# Python 3: the name basestring was removed
		string_types = (str, bytes)
	return iterable(a) and not isinstance(a, string_types)


class Parser(object):
	"""Turn an HTML file into the document model stored in a ``Data`` object.

	The file is parsed with BeautifulSoup's ``html.parser`` backend.
	``parse()`` runs every step in order and ``get_data()`` returns the
	result. Paragraphs, lists, images and tables found under ``<body>`` are
	appended to ``Data._body`` in document order.
	"""

	# ids of the navigation links; parse_content() skips elements carrying them
	_NAVIGATION_IDS = ("prev", "next")
	# names of a first child (None = bare text) that make a cell start with a Paragraph
	_TEXT_CELL_STARTERS = (None, "a", "b", "i")

	def __init__(self, file):
		"""Remember the path of the HTML file; nothing is read until init_parse()."""
		self._file = file
		self._soup = None
		self._data = Data()

	def init_parse(self):
		"""Read the HTML file and build the BeautifulSoup tree in ``self._soup``."""
		with open(self._file) as f:
			html = f.read()
		self._soup = BeautifulSoup(html, 'html.parser')

	def parse_prev_next(self):
		"""Store the ``href`` of the elements with id ``prev`` and ``next``.

		A missing element leaves the corresponding field of the data untouched.
		"""
		prev_link = self._soup.find(id="prev")
		if prev_link:
			self._data._prev = prev_link["href"]

		next_link = self._soup.find(id="next")
		if next_link:
			self._data._next = next_link["href"]

	def parse_title(self):
		"""Store the text of the page's ``<h1>`` as the title.

		A page without ``<h1>`` leaves the title untouched instead of raising,
		the same way parse_prev_next() tolerates missing links.
		"""
		title = self._soup.h1
		if title is None:
			return
		self._data._title = self.get_string_from_content(title)

	def get_string_from_content(self, content):
		"""Return the concatenated text of every node nested under ``content``.

		Children without a name contribute their ``string``; named children
		are descended into recursively.
		"""
		return "".join(
			child.string if not child.name else self.get_string_from_content(child)
			for child in content
		)

	def parse_content(self, content, lvl=0):
		"""Dispatch one node of the body to the matching parse_* method.

		Elements whose id is ``prev`` or ``next`` are skipped. Unnamed nodes
		are ignored, and any other element is descended into.
		``lvl`` is accepted for backward compatibility and is not used.
		"""
		try:
			if content["id"] in self._NAVIGATION_IDS:
				return
		except (TypeError, KeyError):
			# KeyError: no id attribute. TypeError: presumably a text node,
			# which cannot be indexed by attribute name. Either way the node
			# is not a navigation link.
			pass

		name = content.name
		if name == "p":
			self.parse_paragraph(content)
		elif name == "ul":
			self.parse_list(content)
		elif name == "img":
			self.parse_image(content)
		elif name == "table":
			self.parse_table(content)
		elif name:
			# any other element: look for supported content among its children
			for child in content.contents:
				self.parse_content(child)

	def parse_paragraph(self, paragraph_content):
		"""Add ``paragraph_content`` to the body as paragraph content.

		A ``<p>`` starts a new Paragraph. ``<br>``, ``<a>`` and unnamed nodes
		are appended to the last element of the body, and any other element
		is descended into. Objects without a ``name`` attribute are ignored.
		"""
		if not hasattr(paragraph_content, "name"):
			return
		name = paragraph_content.name
		if name == "p":
			self._data._body.append(Paragraph())

		if name == "br":
			self._data._body[-1]._content.append(NewLineContent())
		elif name == "a":
			self._data._body[-1]._content.append(
				TextContent(paragraph_content.string, paragraph_content["href"]))
		elif not name:
			self._data._body[-1]._content.append(TextContent(paragraph_content.string))
		else:
			for child in paragraph_content.contents:
				self.parse_paragraph(child)

	def parse_list(self, list_content, indent=1):
		"""Append ``Paragraph(indent)`` to the body and parse the list's children into it."""
		self._data._body.append(Paragraph(indent))
		for item in list_content.contents:
			self.parse_list_item(item, indent)

	def parse_list_item(self, content, indent):
		"""Parse what one child of a list contains.

		A nested ``<ul>`` is parsed one level deeper, after which a fresh
		``Paragraph(indent)`` is appended for whatever follows it. When
		``content`` is an ``<li>``, the last TextContent of the last body
		element gets ``_new_line`` set to True.
		"""
		for c in content: # iterate the children, dropping the li tag itself
			if hasattr(c, "name") and c.name == "ul":
				self.parse_list(c, indent + 1)
				self._data._body.append(Paragraph(indent))
			else:
				self.parse_paragraph(c)

		if content.name != "li":
			return
		item_content = self._data._body[-1]._content
		if item_content and isinstance(item_content[-1], TextContent):
			item_content[-1]._new_line = True

	def parse_image(self, image_content):
		"""Append the Image built from an ``<img>`` element to the body."""
		self._data._body.append(self.return_parsed_image(image_content))

	def return_parsed_image(self, image_content):
		"""Build an Image from an ``<img>`` element.

		``src`` is required (KeyError when absent). ``width`` and ``height``
		are passed as found in the markup, or -1 when the attribute is absent.
		"""
		src = image_content["src"]
		height = image_content.get("height", -1)
		width = image_content.get("width", -1)
		return Image(src, width, height)

	def parse_body(self):
		"""Parse every direct child of ``<body>``."""
		for content in self._soup.body:
			self.parse_content(content)

	def parse_table(self, table_content):
		"""Append a Table to the body and parse each child of the element as a line."""
		if table_content.name == "table":
			self._data._body.append(Table())

		for line in table_content.contents:
			self.parse_line(line)

	def parse_line(self, line_content):
		"""Add a ``<tr>`` to the last body element as a new line of cells.

		Only ``<td>`` children are parsed; anything that is not a ``<tr>`` is ignored.
		"""
		if line_content.name != "tr":
			return
		self._data._body[-1]._lines.append([])

		col_index = 0
		if hasattr(line_content, "contents"):
			for cell in line_content.contents:
				if hasattr(cell, "name") and cell.name == "td":
					self.parse_cell(cell, col_index)
					col_index += 1

	def parse_cell(self, cell_content, column_index):
		"""Record the width of a ``<td>`` and parse what it contains.

		The ``width`` attribute, when present, overwrites the width stored
		for ``column_index``; a column seen for the first time without one
		gets -1. An empty cell is recorded as an empty Paragraph.
		"""
		if cell_content.name != "td":
			return
		table = self._data._body[-1]
		widths = table._columns_width

		try:
			width = cell_content["width"]
		except KeyError:
			if column_index >= len(widths):
				widths.append(-1)
		else:
			try:
				widths[column_index] = width
			except IndexError:
				widths.append(width)

		children = cell_content.contents
		# A cell that starts with text needs a Paragraph to collect that text.
		# An empty cell gets one too, so it does not raise on children[0] and
		# still occupies its place in the line.
		if not children or children[0].name in self._TEXT_CELL_STARTERS:
			table._lines[-1].append(Paragraph())

		for child in children:
			self.parse_inner_cell(child)

	def parse_inner_cell(self, inner_cell_content):
		"""Add one node found inside a table cell to the current line.

		Text and ``<a>`` become TextContent in the last element of the line,
		``<img>`` becomes a new element of the line, and any other element
		is descended into.
		"""
		inner_name = inner_cell_content.name
		if inner_name and inner_name not in ("a", "img"):
			for child in inner_cell_content.contents:
				self.parse_inner_cell(child)
			return

		line = self._data._body[-1]._lines[-1]
		if not inner_name: # pure text
			line[-1]._content.append(TextContent(inner_cell_content.string))
		elif inner_name == "a":
			line[-1]._content.append(
				TextContent(inner_cell_content.string, inner_cell_content["href"]))
		else:
			line.append(self.return_parsed_image(inner_cell_content))

	def parse(self):
		"""Run the whole pipeline: read the file, then parse links, title and body."""
		self.init_parse()
		self.parse_prev_next()
		self.parse_title()
		self.parse_body()

	def get_data(self):
		"""Return the Data object filled by the parse_* methods."""
		return self._data


if __name__ == "__main__":
	# Manual check: parse tmp.html and show every element of the parsed body.
	parser = Parser("tmp.html")
	parser.parse()
	for b in parser._data._body:
		# the parenthesised form prints the same text on Python 2 and Python 3
		print("content ")
		b.show()