From: Leandro Lucarella <luca@llucax.com.ar>
Date: Sat, 14 Jul 2012 18:09:48 +0000 (+0200)
Subject: Ignore script and style tags
X-Git-Url: https://git.llucax.com/software/subdivxget.git/commitdiff_plain/4907f9c39086861e391bcfbb740e38e23b3fbea2?hp=44c70b16fa26aad38244f816c2b0ad3c3a170ecc

Ignore script and style tags

These tags can contain HTML comments, which HTMLParser does not parse as
comments (they never reach handle_comment()), so we just ignore the tags
entirely to avoid issues; we don't really need them for anything anyway.
---

diff --git a/subdivxget b/subdivxget
index 1aecc66..b5d01ff 100755
--- a/subdivxget
+++ b/subdivxget
@@ -43,6 +43,7 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser):
 		self.parsing = False
 		self.subs = []
 		self.attr = None
+		self.in_script_style = False
 
 	def handle_starttag(self, tag, attrs):
 		attrs = dict(attrs)
@@ -52,6 +53,9 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser):
 			self.parsing = True
 		if not self.parsing:
 			return
+		if tag == 'script' or tag == 'style':
+			self.in_script_style = True
+			return
 		if tag == 'div':
 			if attrs.get('id') == 'buscador_detalle':
 				self.parsing = True
@@ -67,6 +71,9 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser):
 
 	def handle_endtag(self, tag):
 		if self.parsing:
+			if tag == 'script' or tag == 'style':
+				self.in_script_style = False
+				return
 			self.depth -= 1
 		if self.depth == 0:
 			self.parsing = False
@@ -75,6 +82,10 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser):
 		if not self.parsing:
 			return
 		data = data.strip()
+		# Hack to handle comments in <script> <style> which don't end
+		# up in handle_comment(), so we just ignore the whole tags
+		if self.in_script_style:
+			return
 		if self.attr is not None and data:
 			self.cur[self.attr] = data
 			self.attr = None