From 4907f9c39086861e391bcfbb740e38e23b3fbea2 Mon Sep 17 00:00:00 2001 From: Leandro Lucarella Date: Sat, 14 Jul 2012 20:09:48 +0200 Subject: [PATCH 1/1] Ignore script and style tags Those tags can have HTML comments inside of them, and they are not parsed by HTMLParser, so we just ignore them to avoid issues; we don't really need them for anything anyway. --- subdivxget | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/subdivxget b/subdivxget index 1aecc66..b5d01ff 100755 --- a/subdivxget +++ b/subdivxget @@ -43,6 +43,7 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): self.parsing = False self.subs = [] self.attr = None + self.in_script_style = False def handle_starttag(self, tag, attrs): attrs = dict(attrs) @@ -52,6 +53,9 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): self.parsing = True if not self.parsing: return + if tag == 'script' or tag == 'style': + self.in_script_style = True + return if tag == 'div': if attrs.get('id') == 'buscador_detalle': self.parsing = True @@ -67,6 +71,9 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): def handle_endtag(self, tag): if self.parsing: + if tag == 'script' or tag == 'style': + self.in_script_style = False + return self.depth -= 1 if self.depth == 0: self.parsing = False @@ -75,6 +82,10 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): if not self.parsing: return data = data.strip() + # Hack to handle comments in