import sys
import zipfile
import subprocess

# Python 2 / Python 3 compatibility shims
if sys.version_info[0] < 3:
    from HTMLParser import HTMLParser
    from urllib import urlopen, urlretrieve, urlencode
    def get_encoding(info):
        return info.getparam('charset')
else:
    from html.parser import HTMLParser
    from urllib.request import urlopen, urlretrieve
    from urllib.parse import urlencode
    def get_encoding(info):
        return info.get_content_charset('ascii')
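
# get_encoding() pulls the charset out of the HTTP Content-Type header; the
# Python 3 variant falls back to 'ascii' when the server does not announce one.
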
def output(fo, fmt, *args, **kargs):
    if not args:
        args = kargs  # allows named placeholders, e.g. fmt % {'titulo': ...}
    fo.write((fmt % args) + '\n')

def echo(fmt, *args, **kargs):
    if opts.quiet:  # 'opts' is set at module level by parse_args() below
        return
    output(sys.stdout, fmt, *args, **kargs)

def error(fmt, *args, **kargs):
    output(sys.stderr, fmt, *args, **kargs)
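
# These helpers use printf-style formatting, e.g. echo('Extracting %s...', fn)
# and error('Error unraring file %s', fname) further down in the script.
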
class SubDivXQuery(object):
    def __init__(self, to_search, page_number):
        self.host = "www.subdivx.com"
        self.page = "/index.php"
        self.down_page = "/bajar.php"
        self.query = dict(buscar=to_search, pg=page_number)  # field names assumed
    @property
    def url(self):
        return 'http://%s%s?%s' % (self.host, self.page,
                                   urlencode(self.query))
    @property
    def page_uri(self):  # property name assumed; not used elsewhere in the script
        return self.page + '?' + urlencode(self.query)
    @property
    def down_uri(self):
        return 'http://' + self.host + self.down_page
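
# With the (assumed) query fields above, SubDivXQuery('some title', 1).url
# would look like http://www.subdivx.com/index.php?buscar=some+title&pg=1,
# while down_uri is the http://www.subdivx.com/bajar.php prefix that the
# parser below uses to recognize download links.
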
class SubDivXHTMLParser(HTMLParser):
    def __init__(self, down_uri):
        HTMLParser.__init__(self)
        self.down_uri = down_uri
        self.subs = []
        self.cur = None
        self.depth = self.attr_depth = 0
        self.parsing = False
        self.attr = None
        self.in_script_style = False

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)  # HTMLParser delivers attrs as (name, value) pairs
        # Each search result lives in a <div id="menu_detalle_buscador">
        if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
            self.cur = dict()
            self.subs.append(self.cur)
            self.parsing = True
        if not self.parsing:
            return
        if tag == 'script' or tag == 'style':
            self.in_script_style = True
            return
        if tag == 'div':
            if attrs.get('id') == 'buscador_detalle':
                self.parsing = True
            elif attrs.get('id') == 'buscador_detalle_sub':
                self.attr = 'desc'  # field name taken from the filter help (d=desc)
                self.attr_depth = self.depth + 1
                self.cur[self.attr] = ''
        elif tag == 'a':
            if attrs.get('class') == 'titulo_menu_izq':
                self.attr = 'titulo'  # field name taken from the output format below
                self.attr_depth = self.depth + 1
                self.cur[self.attr] = ''
            elif attrs.get('href', '').startswith(self.down_uri):
                self.cur['url'] = attrs['href']
        # br are usually not closed, so ignore them in depth calculation
        if self.parsing and tag != 'br':
            self.depth += 1
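
    # The parser collects one dict per search result.  'depth' counts open
    # tags inside a result block (ignoring <br>), and 'attr_depth' records the
    # depth at which the text field currently being collected started, so
    # handle_endtag() can tell when that field is complete.
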
    def handle_endtag(self, tag):
        if tag == 'script' or tag == 'style':
            self.in_script_style = False
            return
        if self.depth == self.attr_depth:
            self.attr = None  # the field being collected ends at this depth
            self.attr_depth = 0
        # see comment in handle_starttag()
        if self.parsing and tag != 'br':
            self.depth -= 1

    def handle_data(self, data):
        if not self.parsing:
            return
        data = data.strip()
        # Hack to handle comments in <script> <style> which don't end
        # up in handle_comment(), so we just ignore the whole tags
        if self.in_script_style:
            return
        if self.attr is not None and data:
            self.cur[self.attr] += ' ' + data
            if self.attr_depth == 0:
                self.cur[self.attr] = self.cur[self.attr].strip()
                self.attr = None
        elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
            self.attr = data[:-1].lower()
            self.attr_depth = 0  # take just the next piece of text as the value
            self.cur[self.attr] = ''
        elif data == 'Subido por:':
            self.attr = 'autor'  # field name taken from the filter help (a=autor)
            self.attr_depth = 0
            self.cur[self.attr] = ''
        elif data == 'el':  # assumed marker preceding the upload date
            self.attr = 'fecha'
            self.attr_depth = 0
            self.cur[self.attr] = ''
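
    # Label/value pairs arrive as consecutive text nodes: for example (values
    # are hypothetical) the data 'Downloads:' followed by '1234' ends up as
    # cur['downloads'] = '1234'; 'Cds:', 'Comentarios:', 'Formato:' and
    # 'Subido por:' are handled the same way.
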
def filter_subtitles(subs, filters):
    def is_good(sub, filter):
        def is_any_good(sub, filter):
            for value in sub.values():
                if value.lower().find(filter) >= 0:
                    return True
        field = None
        if len(filter) > 2 and filter[1] == ':':
            field, filter = filter[0], filter[2:]
        filter = filter.lower()
        if field is None:
            return is_any_good(sub, filter)
        key = dict(t='titulo', d='desc', a='autor', f='formato', c='comentarios',
                   C='cds', F='fecha', D='downloads').get(field)
        if key is None:
            # Not a recognized field identifier, use the raw filter
            return is_any_good(sub, field + ':' + filter)
        return sub[key].lower().find(filter) >= 0
    result = []
    for sub in subs:
        for filter in filters:
            if not is_good(sub, filter):
                break
        else:
            result.append(sub)
    return result
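
# Example (hypothetical values): filter_subtitles(subs, ['t:matrix', 'dvdrip'])
# keeps only the entries whose 'titulo' contains "matrix" and where "dvdrip"
# appears in some field; matching is case-insensitive in both cases.
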
def subdivx_get_subs(query_str):
    subs, page_number = [], 1
    while True:  # fetch result pages until one comes back empty
        query = SubDivXQuery(query_str, page_number)
        url = urlopen(query.url)
        parser = SubDivXHTMLParser(query.down_uri)
        encoding = get_encoding(url.info())
        for line in url:
            parser.feed(line.decode(encoding))
        if not parser.subs:
            break
        subs.extend(parser.subs)
        page_number += 1
    return subs
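
# Each page is decoded with the charset reported by the server (see
# get_encoding() above) before being fed to the parser, since HTMLParser
# works on text rather than bytes under Python 3.
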
def unzip_subs(fname):
    sub_exts = ('.srt', '.sub')
    z = zipfile.ZipFile(fname, 'r')
    for fn in z.namelist():
        if fn.endswith(sub_exts):
            if '..' in fn or fn.startswith('/'):
                error('Ignoring file with dangerous name: %s', fn)
                continue
            echo('Extracting %s...', fn)
            z.extract(fn)
    z.close()
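
# Archive members containing '..' or starting with '/' are skipped so that a
# malicious zip cannot write files outside the current directory.
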
def get_subs(query_str, filters):
    zip_exts = ('application/zip',)
    rar_exts = ('application/rar', 'application/x-rar-compressed')
    subs = subdivx_get_subs(query_str)
    subs = filter_subtitles(subs, filters)
    subs.sort(key=lambda s: int(s['downloads']), reverse=True)
    for sub in subs:
        echo('- %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)',
             **sub)
        if opts.list_only:
            continue
        fname, headers = urlretrieve(sub['url'])
        if 'Content-Type' in headers:
            if headers['Content-Type'] in zip_exts:
                unzip_subs(fname)
            elif headers['Content-Type'] in rar_exts:
                if subprocess.call(['rar', 'x', fname]) != 0:
                    error('Error unraring file %s', fname)
            else:
                error('Unrecognized file type: %s',
                      headers['Content-Type'])
        else:
            error('No Content-Type!')
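
# ZIP downloads are unpacked in-process with unzip_subs(); RAR downloads are
# handed to an external 'rar' binary, which therefore has to be on the PATH.
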
def parse_args(argv):
    from optparse import OptionParser
    parser = OptionParser(usage="%prog [OPTIONS] QUERY [FILTER ...]",
            description="""
Download subtitles from subdivx.com searching the string QUERY. If FILTERs are
specified, only subtitles that match all those filters are downloaded.
Filters have the format "X:filter", where X is a field specification: t=titulo,
d=desc, a=autor, f=formato, c=comentarios, C=cds, F=fecha and D=downloads.
filter is a string that should be found in that field (case insensitive). If
the field specification is not known (or there isn't one) the filter string is
looked up in all the fields.
""")
    parser.add_option("-l", "--list-only",
            default=False, action='store_true',
            help="don't download the subtitles, just list them")
    parser.add_option("-q", "--quiet",
            default=False, action='store_true',
            help="don't print progress messages")
    (opts, args) = parser.parse_args()
    if not args:
        parser.error("Missing query string")
    if opts.quiet and opts.list_only:
        parser.error("Using --quiet and --list-only together doesn't "
                     "make much sense")
    return (args[0], args[1:], opts)
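
# Example run (script name and query are hypothetical):
#   python subdivx.py "the matrix" t:matrix f:dvdrip -l
# lists the matching subtitles without downloading them; drop -l to download.
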
(query_str, filters, opts) = parse_args(sys.argv)

get_subs(query_str, filters)