X-Git-Url: https://git.llucax.com/software/subdivxget.git/blobdiff_plain/ac39e3e3707a898402033588e822d01c8e6be9f9..2dee376a09210989fb2211099fa023ef3bd0ba18:/subdivxget diff --git a/subdivxget b/subdivxget index d7c32d1..597110c 100755 --- a/subdivxget +++ b/subdivxget @@ -1,10 +1,33 @@ #!/usr/bin/env python import sys -import urllib +if sys.version_info[0] < 3: + from HTMLParser import HTMLParser + from urllib import urlopen, urlretrieve, urlencode + def get_encoding(info): + return info.getparam('charset') + +else: + from html.parser import HTMLParser + from urllib.request import urlopen, urlretrieve + from urllib.parse import urlencode + def get_encoding(info): + return info.get_content_charset('ascii') import zipfile import subprocess -import HTMLParser + + +def output(fo, fmt, *args, **kargs): + if not args: + args = kargs + fo.write((fmt % args) + '\n') + +def echo(fmt, *args, **kargs): + output(sys.stdout, fmt, *args, **kargs) + +def error(fmt, *args, **kargs): + output(sys.stderr, fmt, *args, **kargs) + class SubDivXQuery: def __init__(self, to_search, page_number): @@ -22,27 +45,28 @@ class SubDivXQuery: @property def url(self): return 'http://%s%s?%s' % (self.host, self.page, - urllib.urlencode(self.query)) + urlencode(self.query)) @property def page_uri(self): - return self.page + '?' + urllib.urlencode(self.query) + return self.page + '?' + urlencode(self.query) @property def down_uri(self): return 'http://' + self.host + self.down_page -class SubDivXHTMLParser(HTMLParser.HTMLParser): +class SubDivXHTMLParser(HTMLParser): IDLE = 1 HEADER = 2 def __init__(self, down_uri): - HTMLParser.HTMLParser.__init__(self) + HTMLParser.__init__(self) self.down_uri = down_uri self.depth = 0 self.parsing = False self.subs = [] self.attr = None + self.attr_depth = 0 self.cur = None self.in_script_style = False @@ -62,9 +86,13 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): self.parsing = True elif attrs.get('id') == 'buscador_detalle_sub': self.attr = 'desc' + self.attr_depth = self.depth + 1 + self.cur[self.attr] = '' elif tag == 'a': if attrs.get('class') == 'titulo_menu_izq': self.attr = 'titulo' + self.attr_depth = self.depth + 1 + self.cur[self.attr] = '' elif attrs.get('href', '').startswith(self.down_uri): self.cur['url'] = attrs['href'] # br are usually not closed, so ignore them in depth calculation @@ -76,6 +104,9 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): if tag == 'script' or tag == 'style': self.in_script_style = False return + if self.depth == self.attr_depth: + self.attr = None + self.attr_depth = 0 # see comment in handle_starttag() if tag != 'br': self.depth -= 1 @@ -91,15 +122,73 @@ class SubDivXHTMLParser(HTMLParser.HTMLParser): if self.in_script_style: return if self.attr is not None and data: - self.cur[self.attr] = data - self.attr = None - elif data in ('Downloads:', 'Cds:', 'Comentarios:', - 'Formato:'): + self.cur[self.attr] += ' ' + data + if self.attr_depth == 0: + self.cur[self.attr] = self.cur[self.attr].strip() + self.attr = None + self.attr_depth = 0 + elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'): self.attr = data[:-1].lower() + self.attr_depth = 0 + self.cur[self.attr] = '' elif data == 'Subido por:': self.attr = 'autor' + self.attr_depth = 0 + self.cur[self.attr] = '' elif data == 'el': self.attr = 'fecha' + self.attr_depth = 0 + self.cur[self.attr] = '' + + +def filter_subtitles(subs, filters): + def is_good(sub, filter): + def is_any_good(sub, filter): + for value in sub.values(): + if value.lower().find(filter) >= 0: + return True + + field = None + if len(filter) > 2 and filter[1] == ':': + field = filter[0] + filter = filter[2:] + filter = filter.lower() + + if field is None: + return is_any_good(sub, filter) + elif field == 't': + key = 'titulo' + elif field == 'd': + key = 'desc' + elif field == 'a': + key = 'autor' + elif field == 'f': + key = 'formato' + elif field == 'c': + key = 'comentarios' + elif field == 'C': + key = 'cds' + elif field == 'F': + key = 'fecha' + elif field == 'D': + key = 'downloads' + else: + # Not a recognizer field identifier, use the raw filter + return is_any_good(sub, field + ':' + filter) + + return sub[key].lower().find(filter) >= 0 + + if not filters: + return subs + + result = [] + for sub in subs: + for filter in filters: + if not is_good(sub, filter): + break + else: + result.append(sub) + return result def subdivx_get_subs(query_str): @@ -107,11 +196,16 @@ def subdivx_get_subs(query_str): subs = [] while True: query = SubDivXQuery(query_str, page_number) - url = urllib.urlopen(query.url) + url = urlopen(query.url) parser = SubDivXHTMLParser(query.down_uri) + try: + encoding = get_encoding(url.info()) + except: + encoding = 'ascii' + for line in url: - parser.feed(line) + parser.feed(line.decode(encoding)) url.close() @@ -121,40 +215,78 @@ def subdivx_get_subs(query_str): subs.extend(parser.subs) page_number += 1 - return sorted(subs, key=lambda s: int(s['downloads']), reverse=True) + return subs + + +def unzip_subs(fname): + sub_exts = ('.srt', '.sub') + z = zipfile.ZipFile(fname, 'r') + z.printdir() + for fn in z.namelist(): + if fn.endswith(sub_exts): + if '..' in fn or fn.startswith('/'): + error('Ignoring file with dangerous name: %s', + fn) + continue + echo('Extracting %s...', fn) + z.extract(fn) -def get_subs(query_str): +def get_subs(query_str, filters): + global opts zip_exts = ('application/zip',) rar_exts = ('application/rar', 'application/x-rar-compressed') - for sub in subdivx_get_subs(query_str): - print '''\ - - %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s) - %(desc)s - DOWNLOADING ... - ''' % sub - fname, headers = urllib.urlretrieve(sub['url']) + subs = subdivx_get_subs(query_str) + subs = filter_subtitles(subs, filters) + subs.sort(key=lambda s: int(s['downloads']), reverse=True) + + for sub in subs: + echo('''\ +- %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s) + %(desc)s + DOWNLOADING ... +''', **sub) + if opts.list_only: + continue + fname, headers = urlretrieve(sub['url']) if 'Content-Type' in headers: if headers['Content-Type'] in zip_exts: - z = zipfile.ZipFile(fname, 'r') - z.printdir() - for fn in z.namelist(): - if fn.endswith('.srt') or fn.endswith('.sub'): - if '..' in fn or fn.startswith('/'): - print 'Dangerous file name:', fn - continue - print 'Extracting', fn, '...' - z.extract(fn) + unzip_subs(fname) elif headers['Content-Type'] in rar_exts: if subprocess.call(['rar', 'x', fname]) != 0: - print 'Error unraring file %s' % fname + error('Error unraring file %s', fname) else: - print 'Unrecognized file type:', headers['Content-Type'] + error('Unrecognized file type:', + headers['Content-Type']) else: - print 'No Content-Type!' + error('No Content-Type!') + + +def parse_args(argv): + from optparse import OptionParser + parser = OptionParser(usage="%prog [OPTIONS] QUERY [FILTER ...]", + description=""" +Download subtitles from subdivx.com searching the string QUERY. If FILTERs are +specified, only subtitles that matches all those filters are downloaded. +Filters have the format "X:fitler", where X is a field specification: t=titulo, +d=desc, a=autor, f=formato, c=comentarios, C=cds, F=fecha and D=downloads. +filter is a string that should be found on that field (case insensitive). If +the format specifier is not known (or there isn't one) the filter string is +looked in all the fields. + """.strip()) + parser.add_option("-l", "--list-only", + default=False, action='store_true', + help="Don't download the subtitles, just list them") + + (opts, args) = parser.parse_args() + if not args: + parser.error("Missing query string") + + return (args[0], args[1:], opts) + +(query_str, filters, opts) = parse_args(sys.argv) +get_subs(query_str, filters) -for q in sys.argv[1:]: - get_subs(q)