# Select the HTML parser and URL helpers according to the interpreter's major
# version, and define a matching get_encoding() helper for each case.
4 if sys.version_info[0] < 3:
5 from HTMLParser import HTMLParser
6 from urllib import urlopen, urlretrieve, urlencode
# Python 2 variant: ask the response info object for its 'charset' parameter.
7 def get_encoding(info):
8 return info.getparam('charset')
# NOTE(review): the imports below use the Python 3 module names; presumably
# they belong to the alternative branch of the version check -- confirm.
11 from html.parser import HTMLParser
12 from urllib.request import urlopen, urlretrieve
13 from urllib.parse import urlencode
# Python 3 variant: get_content_charset() is given 'ascii' as the value to
# fall back on.
14 def get_encoding(info):
15 return info.get_content_charset('ascii')
# Format fmt with the % operator and write the result, followed by a newline,
# to the file object fo.
# NOTE(review): how kargs takes part in the formatting is not visible in
# these lines -- confirm before relying on keyword arguments.
20 def output(fo, fmt, *args, **kargs):
23 fo.write((fmt % args) + '\n')
def echo(fmt, *args, **kargs):
    """Emit a formatted message on standard output.

    fmt, the positional arguments and the keyword arguments are handed
    over unchanged to output(), with sys.stdout as the target stream.
    """
    stream = sys.stdout
    output(stream, fmt, *args, **kargs)
def error(fmt, *args, **kargs):
    """Emit a formatted message on standard error.

    fmt, the positional arguments and the keyword arguments are handed
    over unchanged to output(), with sys.stderr as the target stream.
    """
    stream = sys.stderr
    output(stream, fmt, *args, **kargs)
# Record the site host plus the paths of the search page and download page.
# NOTE(review): to_search and page_number are not used in the lines shown;
# presumably they populate self.query, which is read below -- confirm.
33 def __init__(self, to_search, page_number):
34 self.host = "www.subdivx.com"
35 self.page = "/index.php"
36 self.down_page = "/bajar.php"
# Absolute search URL: scheme, host, search page and the URL-encoded query.
47 return 'http://%s%s?%s' % (self.host, self.page,
48 urlencode(self.query))
# Search page path plus the URL-encoded query, without scheme or host.
51 return self.page + '?' + urlencode(self.query)
# Absolute URL of the download page, without any query string.
54 return 'http://' + self.host + self.down_page
# HTMLParser subclass that builds subtitle entries while a page is fed to it.
# The entry being filled is kept in self.cur (a mapping of field name to
# text) and entries are appended to self.subs.
57 class SubDivXHTMLParser(HTMLParser):
# down_uri is the prefix used by handle_starttag() to recognise download links.
62 def __init__(self, down_uri):
63 HTMLParser.__init__(self)
64 self.down_uri = down_uri
# Set while the parser is inside a <script> or <style> element.
71 self.in_script_style = False
# NOTE(review): attrs is queried with .get() below, so it is presumably
# converted to a dict in a line not shown here -- confirm.
73 def handle_starttag(self, tag, attrs):
75 if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
77 self.subs.append(self.cur)
81 if tag == 'script' or tag == 'style':
82 self.in_script_style = True
85 if attrs.get('id') == 'buscador_detalle':
87 elif attrs.get('id') == 'buscador_detalle_sub':
# The field's text lives one nesting level below the current depth; start it
# empty so handle_data() can append to it.
89 self.attr_depth = self.depth + 1
90 self.cur[self.attr] = ''
92 if attrs.get('class') == 'titulo_menu_izq':
94 self.attr_depth = self.depth + 1
95 self.cur[self.attr] = ''
# A link whose href starts with the download URI is stored as the entry's URL.
96 elif attrs.get('href', '').startswith(self.down_uri):
97 self.cur['url'] = attrs['href']
98 # br are usually not closed, so ignore them in depth calculation
99 if self.parsing and tag != 'br':
102 def handle_endtag(self, tag):
# Leaving a <script>/<style> element re-enables text handling.
104 if tag == 'script' or tag == 'style':
105 self.in_script_style = False
107 if self.depth == self.attr_depth:
110 # see comment in handle_starttag()
116 def handle_data(self, data):
120 # Hack to handle comments in <script> <style> which don't end
121 # up in handle_comment(), so we just ignore the whole tags
122 if self.in_script_style:
# While a field is selected, non-empty text is appended to it, separated by a
# space; the accumulated value is stripped when attr_depth is 0.
124 if self.attr is not None and data:
125 self.cur[self.attr] += ' ' + data
126 if self.attr_depth == 0:
127 self.cur[self.attr] = self.cur[self.attr].strip()
# A label such as 'Downloads:' selects the field named after it: the trailing
# colon is dropped and the rest is lowercased.
130 elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
131 self.attr = data[:-1].lower()
133 self.cur[self.attr] = ''
134 elif data == 'Subido por:':
137 self.cur[self.attr] = ''
141 self.cur[self.attr] = ''
# Check the subtitle entries in subs against the filter strings in filters.
# NOTE(review): what is returned is not visible in these lines.
144 def filter_subtitles(subs, filters):
# Decide whether one subtitle entry satisfies one filter string.
145 def is_good(sub, filter):
# Look for filter as a substring of the lowercased form of every value in sub.
146 def is_any_good(sub, filter):
147 for value in sub.values():
148 if value.lower().find(filter) >= 0:
# A filter longer than two characters whose second character is ':' is treated
# as carrying a field specifier.
152 if len(filter) > 2 and filter[1] == ':':
155 filter = filter.lower()
158 return is_any_good(sub, filter)
176 # Not a recognized field identifier, use the raw filter
177 return is_any_good(sub, field + ':' + filter)
# Substring test against the lowercased value of the selected field only.
179 return sub[key].lower().find(filter) >= 0
186 for filter in filters:
187 if not is_good(sub, filter):
# Query the site for query_str and gather the subtitle entries found by the
# HTML parser into subs.
# NOTE(review): page_number suggests the results are fetched page by page;
# the loop and the return statement are not visible here -- confirm.
194 def subdivx_get_subs(query_str):
# Build the query for one results page, open its URL and create a parser that
# recognises download links by the query's download URI.
198 query = SubDivXQuery(query_str, page_number)
199 url = urlopen(query.url)
200 parser = SubDivXHTMLParser(query.down_uri)
# Charset taken from the response info, used to decode the data fed to the parser.
203 encoding = get_encoding(url.info())
208 parser.feed(line.decode(encoding))
# Add the entries found by this parser to the accumulated list.
215 subs.extend(parser.subs)
# Go through the members of the zip archive fname and handle those whose
# names end in a subtitle extension ('.srt' or '.sub').
# NOTE(review): the extraction call itself is not visible in these lines.
221 def unzip_subs(fname):
222 sub_exts = ('.srt', '.sub')
223 z = zipfile.ZipFile(fname, 'r')
225 for fn in z.namelist():
226 if fn.endswith(sub_exts):
# Member names containing '..' or starting with '/' are reported as dangerous
# instead of being processed.
227 if '..' in fn or fn.startswith('/'):
228 error('Ignoring file with dangerous name: %s',
231 echo('Extracting %s...', fn)
# Search for query_str, apply the filters, and download each remaining
# subtitle, unpacking it according to the Content-Type of the response.
235 def get_subs(query_str, filters):
# Content-Type values that identify zip and rar archives respectively.
237 zip_exts = ('application/zip',)
238 rar_exts = ('application/rar', 'application/x-rar-compressed')
240 subs = subdivx_get_subs(query_str)
241 subs = filter_subtitles(subs, filters)
# Most downloaded first; each 'downloads' value is converted with int().
242 subs.sort(key=lambda s: int(s['downloads']), reverse=True)
246 - %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
# Download to a local file; the response headers decide how to unpack it.
252 fname, headers = urlretrieve(sub['url'])
253 if 'Content-Type' in headers:
254 if headers['Content-Type'] in zip_exts:
256 elif headers['Content-Type'] in rar_exts:
# rar archives are unpacked by running the external 'rar' command; a non-zero
# exit status is reported as an error.
257 if subprocess.call(['rar', 'x', fname]) != 0:
258 error('Error unraring file %s', fname)
# NOTE(review): this format string has no %s placeholder for the extra
# argument. Since output() applies fmt % args, this call looks like it raises
# TypeError instead of printing the message -- confirm and add a placeholder.
260 error('Unrecognized file type:',
261 headers['Content-Type'])
263 error('No Content-Type!')
# Parse the command line and return a tuple (query, filters, opts): the first
# positional argument, the remaining positional arguments, and the options.
# NOTE(review): argv is not passed to parser.parse_args() in the lines shown,
# so the parser falls back to its default argument source -- confirm intent.
266 def parse_args(argv):
267 from optparse import OptionParser
268 parser = OptionParser(usage="%prog [OPTIONS] QUERY [FILTER ...]",
270 Download subtitles from subdivx.com searching the string QUERY. If FILTERs are
271 specified, only subtitles that matches all those filters are downloaded.
272 Filters have the format "X:fitler", where X is a field specification: t=titulo,
273 d=desc, a=autor, f=formato, c=comentarios, C=cds, F=fecha and D=downloads.
274 filter is a string that should be found on that field (case insensitive). If
275 the format specifier is not known (or there isn't one) the filter string is
276 looked in all the fields.
# Boolean flag, off by default, for listing the subtitles without downloading.
278 parser.add_option("-l", "--list-only",
279 default=False, action='store_true',
280 help="Don't download the subtitles, just list them")
282 (opts, args) = parser.parse_args()
# A query string is mandatory; its absence is reported through the parser.
284 parser.error("Missing query string")
286 return (args[0], args[1:], opts)
# Script entry: parse the command line, then search for and fetch subtitles.
# NOTE(review): opts is unpacked here but not used in the lines shown.
288 (query_str, filters, opts) = parse_args(sys.argv)
290 get_subs(query_str, filters)