X-Git-Url: https://git.llucax.com/software/subdivxget.git/blobdiff_plain/aa52c87e353921174d3b3874c2a261f175970a8c..fc1741837b1c15d0fed737b7798fe21ec7d33c64:/subdivxget diff --git a/subdivxget b/subdivxget index 5c5cc7a..38d46c4 100755 --- a/subdivxget +++ b/subdivxget @@ -7,12 +7,13 @@ import subprocess import HTMLParser class SubDivXQuery: - def __init__(self, to_search): + def __init__(self, to_search, page_number): self.host = "www.subdivx.com" self.page = "/index.php" self.down_page = "/bajar.php" self.query = dict( buscar = to_search, + pg = page_number, accion = 5, masdesc = '', subtitulos = 1, @@ -29,8 +30,6 @@ class SubDivXQuery: def down_uri(self): return 'http://' + self.host + self.down_page -class Subtitle: - pass class SubDivXHTMLParser(HTMLParser.HTMLParser): @@ -44,73 +43,166 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): self.parsing = False self.subs = [] self.attr = None + self.attr_depth = 0 + self.cur = None + self.in_script_style = False def handle_starttag(self, tag, attrs): attrs = dict(attrs) if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador': - #self.cur = Subtitle() self.cur = dict() self.subs.append(self.cur) self.parsing = True if not self.parsing: return + if tag == 'script' or tag == 'style': + self.in_script_style = True + return if tag == 'div': if attrs.get('id') == 'buscador_detalle': self.parsing = True elif attrs.get('id') == 'buscador_detalle_sub': self.attr = 'desc' + self.attr_depth = self.depth + 1 + self.cur[self.attr] = '' elif tag == 'a': if attrs.get('class') == 'titulo_menu_izq': - self.attr = 'title' + self.attr = 'titulo' + self.attr_depth = self.depth + 1 + self.cur[self.attr] = '' elif attrs.get('href', '').startswith(self.down_uri): self.cur['url'] = attrs['href'] - if self.parsing: + # br are usually not closed, so ignore them in depth calculation + if self.parsing and tag != 'br': self.depth += 1 def handle_endtag(self, tag): if self.parsing: - self.depth -= 1 + if tag == 'script' or tag == 'style': + self.in_script_style = False + return + if self.depth == self.attr_depth: + self.attr = None + self.attr_depth = 0 + # see comment in handle_starttag() + if tag != 'br': + self.depth -= 1 if self.depth == 0: self.parsing = False def handle_data(self, data): - if self.parsing: - data = data.strip() - if self.attr is not None and data: - self.cur[self.attr] = data + if not self.parsing: + return + data = data.strip() + # Hack to handle comments in