From: Leandro Lucarella Date: Sat, 14 Jul 2012 18:09:48 +0000 (+0200) Subject: Ignore script and style tags X-Git-Url: https://git.llucax.com/software/subdivxget.git/commitdiff_plain/4907f9c39086861e391bcfbb740e38e23b3fbea2?ds=inline Ignore script and style tags Those tags can have HTML comments inside of them, and they are not parsed by HTMLParser, so we just ignore them to avoid issues. We don't really need them for anything anyway. --- diff --git a/subdivxget b/subdivxget index 1aecc66..b5d01ff 100755 --- a/subdivxget +++ b/subdivxget @@ -43,6 +43,7 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): self.parsing = False self.subs = [] self.attr = None + self.in_script_style = False def handle_starttag(self, tag, attrs): attrs = dict(attrs) @@ -52,6 +53,9 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): self.parsing = True if not self.parsing: return + if tag == 'script' or tag == 'style': + self.in_script_style = True + return if tag == 'div': if attrs.get('id') == 'buscador_detalle': self.parsing = True @@ -67,6 +71,9 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): def handle_endtag(self, tag): if self.parsing: + if tag == 'script' or tag == 'style': + self.in_script_style = False + return self.depth -= 1 if self.depth == 0: self.parsing = False @@ -75,6 +82,10 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): if not self.parsing: return data = data.strip() + # Hack to handle comments in