X-Git-Url: https://git.llucax.com/software/subdivxget.git/blobdiff_plain/8a379795621804313893da5b9ab3ff3ff1b022cf..fc1741837b1c15d0fed737b7798fe21ec7d33c64:/subdivxget?ds=sidebyside diff --git a/subdivxget b/subdivxget index 395eb35..38d46c4 100755 --- a/subdivxget +++ b/subdivxget @@ -43,6 +43,7 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): self.parsing = False self.subs = [] self.attr = None + self.attr_depth = 0 self.cur = None self.in_script_style = False @@ -62,12 +63,17 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): self.parsing = True elif attrs.get('id') == 'buscador_detalle_sub': self.attr = 'desc' + self.attr_depth = self.depth + 1 + self.cur[self.attr] = '' elif tag == 'a': if attrs.get('class') == 'titulo_menu_izq': self.attr = 'titulo' + self.attr_depth = self.depth + 1 + self.cur[self.attr] = '' elif attrs.get('href', '').startswith(self.down_uri): self.cur['url'] = attrs['href'] - if self.parsing: + # br are usually not closed, so ignore them in depth calculation + if self.parsing and tag != 'br': self.depth += 1 def handle_endtag(self, tag): @@ -75,7 +81,12 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): if tag == 'script' or tag == 'style': self.in_script_style = False return - self.depth -= 1 + if self.depth == self.attr_depth: + self.attr = None + self.attr_depth = 0 + # see comment in handle_starttag() + if tag != 'br': + self.depth -= 1 if self.depth == 0: self.parsing = False @@ -88,15 +99,73 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): if self.in_script_style: return if self.attr is not None and data: - self.cur[self.attr] = data - self.attr = None - elif data in ('Downloads:', 'Cds:', 'Comentarios:', - 'Formato:'): + self.cur[self.attr] += ' ' + data + if self.attr_depth == 0: + self.cur[self.attr] = self.cur[self.attr].strip() + self.attr = None + self.attr_depth = 0 + elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'): self.attr = data[:-1].lower() + self.attr_depth = 0 + self.cur[self.attr] = '' elif data == 'Subido por:': self.attr = 'autor' + self.attr_depth = 0 + self.cur[self.attr] = '' elif data == 'el': self.attr = 'fecha' + self.attr_depth = 0 + self.cur[self.attr] = '' + + +def filter_subtitles(subs, filters): + def is_good(sub, filter): + def is_any_good(sub, filter): + for value in sub.values(): + if value.lower().find(filter) >= 0: + return True + + field = None + if len(filter) > 2 and filter[1] == ':': + field = filter[0] + filter = filter[2:] + filter = filter.lower() + + if field is None: + return is_any_good(sub, filter) + elif field == 't': + key = 'titulo' + elif field == 'd': + key = 'desc' + elif field == 'a': + key = 'autor' + elif field == 'f': + key = 'formato' + elif field == 'c': + key = 'comentarios' + elif field == 'C': + key = 'cds' + elif field == 'F': + key = 'fecha' + elif field == 'D': + key = 'downloads' + else: + # Not a recognizer field identifier, use the raw filter + return is_any_good(sub, field + ':' + filter) + + return sub[key].lower().find(filter) >= 0 + + if not filters: + return subs + + result = [] + for sub in subs: + for filter in filters: + if not is_good(sub, filter): + break + else: + result.append(sub) + return result def subdivx_get_subs(query_str): @@ -121,16 +190,19 @@ def subdivx_get_subs(query_str): return sorted(subs, key=lambda s: int(s['downloads']), reverse=True) -def get_subs(query_str): +def get_subs(query_str, filters): zip_exts = ('application/zip',) rar_exts = ('application/rar', 'application/x-rar-compressed') - for sub in subdivx_get_subs(query_str): + subs = subdivx_get_subs(query_str) + subs = filter_subtitles(subs, filters) + + for sub in subs: print '''\ - - %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s) - %(desc)s - DOWNLOADING ... - ''' % sub +- %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s) + %(desc)s + DOWNLOADING ... +''' % sub fname, headers = urllib.urlretrieve(sub['url']) if 'Content-Type' in headers: if headers['Content-Type'] in zip_exts: @@ -152,6 +224,5 @@ def get_subs(query_str): print 'No Content-Type!' -for q in sys.argv[1:]: - get_subs(q) +get_subs(sys.argv[1], sys.argv[2:])