git.llucax.com Git - software/subdivxget.git/commitdiff
Ignore script and style tags
author Leandro Lucarella <luca@llucax.com.ar>
Sat, 14 Jul 2012 18:09:48 +0000 (20:09 +0200)
committer Leandro Lucarella <luca@llucax.com.ar>
Sat, 14 Jul 2012 18:12:53 +0000 (20:12 +0200)
Those tags can have HTML comments inside of them, and those comments are
not parsed by HTMLParser, so we just ignore the tags entirely to avoid
issues; we don't really need them for anything anyway.

subdivxget

index 1aecc66b904811165ec5b5f1bfd44798cd551d42..b5d01ff40b29e3e17fcb7826779fba85d011cf87 100755 (executable)
@@ -43,6 +43,7 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser):
                self.parsing = False
                self.subs = []
                self.attr = None
+               self.in_script_style = False
 
        def handle_starttag(self, tag, attrs):
                attrs = dict(attrs)
@@ -52,6 +53,9 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser):
                        self.parsing = True
                if not self.parsing:
                        return
+               if tag == 'script' or tag == 'style':
+                       self.in_script_style = True
+                       return
                if tag == 'div':
                        if attrs.get('id') == 'buscador_detalle':
                                self.parsing = True
@@ -67,6 +71,9 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser):
 
        def handle_endtag(self, tag):
                if self.parsing:
+                       if tag == 'script' or tag == 'style':
+                               self.in_script_style = False
+                               return
                        self.depth -= 1
                if self.depth == 0:
                        self.parsing = False
@@ -75,6 +82,10 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser):
                if not self.parsing:
                        return
                data = data.strip()
+               # Hack to handle comments in <script> <style> which don't end
+               # up in handle_comment(), so we just ignore the whole tags
+               if self.in_script_style:
+                       return
                if self.attr is not None and data:
                        self.cur[self.attr] = data
                        self.attr = None