4 if sys.version_info[0] < 3:
5 from HTMLParser import HTMLParser
6 from urllib import urlopen, urlretrieve, urlencode
def get_encoding(info):
    """Return the charset declared in the response headers (Python 2).

    info is the header object obtained from the opened URL; only its
    getparam() method is used.  When no charset parameter is declared,
    getparam() gives None, which cannot be passed to decode(), so fall
    back to 'ascii' like the Python 3 variant of this helper does.
    """
    return info.getparam('charset') or 'ascii'
11 from html.parser import HTMLParser
12 from urllib.request import urlopen, urlretrieve
13 from urllib.parse import urlencode
def get_encoding(info):
    """Return the charset declared in the response headers (Python 3).

    info is the header object obtained from the opened URL; 'ascii' is
    handed to get_content_charset() as the value to use when the headers
    declare no charset.
    """
    fallback = 'ascii'
    return info.get_content_charset(fallback)
20 def output(fo, fmt, *args, **kargs):
23 fo.write((fmt % args) + '\n')
def echo(fmt, *args, **kargs):
    """Send a formatted message to standard output.

    All arguments are forwarded unchanged to output(), with sys.stdout
    as the target stream.
    """
    stream = sys.stdout
    output(stream, fmt, *args, **kargs)
def error(fmt, *args, **kargs):
    """Send a formatted message to standard error.

    All arguments are forwarded unchanged to output(), with sys.stderr
    as the target stream.
    """
    stream = sys.stderr
    output(stream, fmt, *args, **kargs)
33 def __init__(self, to_search, page_number):
34 self.host = "www.subdivx.com"
35 self.page = "/index.php"
36 self.down_page = "/bajar.php"
47 return 'http://%s%s?%s' % (self.host, self.page,
48 urlencode(self.query))
51 return self.page + '?' + urlencode(self.query)
54 return 'http://' + self.host + self.down_page
57 class SubDivXHTMLParser(HTMLParser):
62 def __init__(self, down_uri):
63 HTMLParser.__init__(self)
64 self.down_uri = down_uri
71 self.in_script_style = False
73 def handle_starttag(self, tag, attrs):
75 if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
77 self.subs.append(self.cur)
81 if tag == 'script' or tag == 'style':
82 self.in_script_style = True
85 if attrs.get('id') == 'buscador_detalle':
87 elif attrs.get('id') == 'buscador_detalle_sub':
89 self.attr_depth = self.depth + 1
90 self.cur[self.attr] = ''
92 if attrs.get('class') == 'titulo_menu_izq':
94 self.attr_depth = self.depth + 1
95 self.cur[self.attr] = ''
96 elif attrs.get('href', '').startswith(self.down_uri):
97 self.cur['url'] = attrs['href']
98 # br are usually not closed, so ignore them in depth calculation
99 if self.parsing and tag != 'br':
102 def handle_endtag(self, tag):
104 if tag == 'script' or tag == 'style':
105 self.in_script_style = False
107 if self.depth == self.attr_depth:
110 # see comment in handle_starttag()
116 def handle_data(self, data):
120 # Hack to handle comments in <script> <style> which don't end
121 # up in handle_comment(), so we just ignore the whole tags
122 if self.in_script_style:
124 if self.attr is not None and data:
125 self.cur[self.attr] += ' ' + data
126 if self.attr_depth == 0:
127 self.cur[self.attr] = self.cur[self.attr].strip()
130 elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
131 self.attr = data[:-1].lower()
133 self.cur[self.attr] = ''
134 elif data == 'Subido por:':
137 self.cur[self.attr] = ''
141 self.cur[self.attr] = ''
144 def filter_subtitles(subs, filters):
145 def is_good(sub, filter):
146 def is_any_good(sub, filter):
147 for value in sub.values():
148 if value.lower().find(filter) >= 0:
152 if len(filter) > 2 and filter[1] == ':':
155 filter = filter.lower()
158 return is_any_good(sub, filter)
# Not a recognized field identifier, use the raw filter
177 return is_any_good(sub, field + ':' + filter)
179 return sub[key].lower().find(filter) >= 0
186 for filter in filters:
187 if not is_good(sub, filter):
194 def subdivx_get_subs(query_str):
198 query = SubDivXQuery(query_str, page_number)
199 url = urlopen(query.url)
200 parser = SubDivXHTMLParser(query.down_uri)
203 encoding = get_encoding(url.info())
208 parser.feed(line.decode(encoding))
215 subs.extend(parser.subs)
221 def get_subs(query_str, filters):
222 sub_exts = ('.srt', '.sub')
223 zip_exts = ('application/zip',)
224 rar_exts = ('application/rar', 'application/x-rar-compressed')
226 subs = subdivx_get_subs(query_str)
227 subs = filter_subtitles(subs, filters)
228 subs.sort(key=lambda s: int(s['downloads']), reverse=True)
232 - %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
236 fname, headers = urlretrieve(sub['url'])
237 if 'Content-Type' in headers:
238 if headers['Content-Type'] in zip_exts:
239 z = zipfile.ZipFile(fname, 'r')
241 for fn in z.namelist():
242 if fn.endswith(sub_exts):
243 if '..' in fn or fn.startswith('/'):
244 error('Dangerous file name: %s', fn)
246 echo('Extracting %s...', fn)
248 elif headers['Content-Type'] in rar_exts:
249 if subprocess.call(['rar', 'x', fname]) != 0:
250 error('Error unraring file %s', fname)
# The format string needs a %s placeholder: output() applies `fmt % args`,
# so passing an argument without one raises TypeError instead of
# reporting the unexpected Content-Type.
error('Unrecognized file type: %s',
      headers['Content-Type'])
255 error('No Content-Type!')
258 get_subs(sys.argv[1], sys.argv[2:])