X-Git-Url: https://git.llucax.com/software/subdivxget.git/blobdiff_plain/4907f9c39086861e391bcfbb740e38e23b3fbea2..8c1a098a0ed392dfb253e7fecd46353eea7e371a:/subdivxget?ds=inline diff --git a/subdivxget b/subdivxget index b5d01ff..4ebcca5 100755 --- a/subdivxget +++ b/subdivxget @@ -43,6 +43,8 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): self.parsing = False self.subs = [] self.attr = None + self.attr_depth = 0 + self.cur = None self.in_script_style = False def handle_starttag(self, tag, attrs): @@ -61,12 +63,17 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): self.parsing = True elif attrs.get('id') == 'buscador_detalle_sub': self.attr = 'desc' + self.attr_depth = self.depth + 1 + self.cur[self.attr] = '' elif tag == 'a': if attrs.get('class') == 'titulo_menu_izq': - self.attr = 'title' + self.attr = 'titulo' + self.attr_depth = self.depth + 1 + self.cur[self.attr] = '' elif attrs.get('href', '').startswith(self.down_uri): self.cur['url'] = attrs['href'] - if self.parsing: + # br are usually not closed, so ignore them in depth calculation + if self.parsing and tag != 'br': self.depth += 1 def handle_endtag(self, tag): @@ -74,7 +81,12 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): if tag == 'script' or tag == 'style': self.in_script_style = False return - self.depth -= 1 + if self.depth == self.attr_depth: + self.attr = None + self.attr_depth = 0 + # see comment in handle_starttag() + if tag != 'br': + self.depth -= 1 if self.depth == 0: self.parsing = False @@ -87,15 +99,23 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): if self.in_script_style: return if self.attr is not None and data: - self.cur[self.attr] = data - self.attr = None - elif data in ('Downloads:', 'Cds:', 'Comentarios:', - 'Formato:'): + self.cur[self.attr] += ' ' + data + if self.attr_depth == 0: + self.cur[self.attr] = self.cur[self.attr].strip() + self.attr = None + self.attr_depth = 0 + elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'): self.attr = data[:-1].lower() + self.attr_depth = 0 + self.cur[self.attr] = '' elif data == 'Subido por:': self.attr = 'autor' + self.attr_depth = 0 + self.cur[self.attr] = '' elif data == 'el': self.attr = 'fecha' + self.attr_depth = 0 + self.cur[self.attr] = '' def subdivx_get_subs(query_str): @@ -126,7 +146,7 @@ def get_subs(query_str): for sub in subdivx_get_subs(query_str): print '''\ - - %(title)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s) + - %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s) %(desc)s DOWNLOADING ... ''' % sub