    def __init__(self, to_search, page_number):
        self.host = "www.subdivx.com"
        self.page = "/index.php"
        self.down_page = "/bajar.php"
        # Parameters sent to the site's search form (the field names used
        # here for the searched text and the page number are assumptions)
        self.query = dict(buscar=to_search, pg=page_number)

    @property
    def url(self):
        return 'http://%s%s?%s' % (self.host, self.page,
                                   urllib.urlencode(self.query))

    @property
    def page_uri(self):
        return self.page + '?' + urllib.urlencode(self.query)

    @property
    def down_uri(self):
        return 'http://' + self.host + self.down_page


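# SubDivXHTMLParser scrapes subdivx's search-result HTML, building one dict
# per subtitle entry (title, description, author, date, download count, the
# download URL, ...) keyed off the ids and classes used in the site's markup.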
class SubDivXHTMLParser(HTMLParser.HTMLParser):

    def __init__(self, down_uri):
        HTMLParser.HTMLParser.__init__(self)
        self.down_uri = down_uri
        self.depth = 0
        self.parsing = False
        self.subs = []
        self.attr = None
        self.attr_depth = 0
        self.cur = None
        self.in_script_style = False

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        # Each search result lives in this div; start a fresh entry
        if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
            self.cur = dict()
            self.subs.append(self.cur)
            self.parsing = True
        if not self.parsing:
            return
        if tag == 'script' or tag == 'style':
            self.in_script_style = True
            return
        if tag == 'div':
            if attrs.get('id') == 'buscador_detalle':
                self.parsing = True
            elif attrs.get('id') == 'buscador_detalle_sub':
                self.attr = 'desc'
                self.attr_depth = self.depth + 1
                self.cur[self.attr] = ''
        elif tag == 'a':
            if attrs.get('class') == 'titulo_menu_izq':
                self.attr = 'titulo'
                self.attr_depth = self.depth + 1
                self.cur[self.attr] = ''
            elif attrs.get('href', '').startswith(self.down_uri):
                self.cur['url'] = attrs['href']
        # br are usually not closed, so ignore them in depth calculation
        if self.parsing and tag != 'br':
            self.depth += 1

    def handle_endtag(self, tag):
        if self.parsing:
            if tag == 'script' or tag == 'style':
                self.in_script_style = False
                return
            if self.depth == self.attr_depth:
                self.attr_depth = 0
                self.attr = None
            # see comment in handle_starttag()
            if tag != 'br':
                self.depth -= 1
            if self.depth == 0:
                self.parsing = False

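    # Text nodes go into whichever attribute handle_starttag() selected;
    # the site's fixed labels ("Downloads:", "Subido por:", ...) switch the
    # attribute that the following text belongs to.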
    def handle_data(self, data):
        if not self.parsing:
            return
        data = data.strip()
        # Hack to handle comments in <script> <style> which don't end
        # up in handle_comment(), so we just ignore the whole tags
        if self.in_script_style:
            return
        if self.attr is not None and data:
            self.cur[self.attr] += ' ' + data
            if self.attr_depth == 0:
                self.cur[self.attr] = self.cur[self.attr].strip()
                self.attr = None
        elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
            self.attr = data[:-1].lower()
            self.attr_depth = 0
            self.cur[self.attr] = ''
        elif data == 'Subido por:':
            self.attr = 'autor'
            self.attr_depth = 0
            self.cur[self.attr] = ''
        elif data == 'el':
            # the upload date follows "el" in "Subido por: <autor> el <fecha>"
            self.attr = 'fecha'
            self.attr_depth = 0
            self.cur[self.attr] = ''


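# Filters are either plain text, matched against every field of a subtitle,
# or "<letter>:<text>" to restrict the match to one field, e.g. "a:someuser"
# to match the author (the examples here are illustrative only).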
def filter_subtitles(subs, filters):
    def is_good(sub, filter):
        def is_any_good(sub, filter):
            # A plain filter matches if any field of the subtitle contains it
            for value in sub.values():
                if value.lower().find(filter) >= 0:
                    return True
            return False

        # A "<letter>:<text>" filter restricts the match to a single field
        field = None
        if len(filter) > 2 and filter[1] == ':':
            field = filter[0]
            filter = filter[2:]
        filter = filter.lower()
        if field is None:
            return is_any_good(sub, filter)
        # One-letter prefix for each field (lowercase initials, capitals
        # where they would clash)
        keys = dict(t='titulo', d='desc', a='autor', f='formato',
                    c='comentarios', C='cds', F='fecha', D='downloads')
        if field not in keys:
            # Not a recognized field identifier, use the raw filter
            return is_any_good(sub, field + ':' + filter)
        return sub[keys[field]].lower().find(filter) >= 0

    if not filters:
        return subs
    # Keep only the subtitles that satisfy every filter
    result = []
    for sub in subs:
        for filter in filters:
            if not is_good(sub, filter):
                break
        else:
            result.append(sub)
    return result


def subdivx_get_subs(query_str):
    page_number = 1
    subs = []
    # Fetch result pages until one comes back with no entries
    while True:
        query = SubDivXQuery(query_str, page_number)
        url = urllib.urlopen(query.url)
        parser = SubDivXHTMLParser(query.down_uri)
        for line in url:
            parser.feed(line)
        url.close()
        if not parser.subs:
            break
        subs.extend(parser.subs)
        page_number += 1
    # Most downloaded subtitles first
    return sorted(subs, key=lambda s: int(s['downloads']), reverse=True)


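# get_subs() downloads every subtitle that survives the filters and unpacks
# it according to the Content-Type reported by the server: zip archives are
# handled with zipfile, rar archives are passed to an external `rar` binary,
# which must be available on the PATH.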
def get_subs(query_str, filters):
    zip_exts = ('application/zip',)
    rar_exts = ('application/rar', 'application/x-rar-compressed')

    subs = subdivx_get_subs(query_str)
    subs = filter_subtitles(subs, filters)

    for sub in subs:
        print '- %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)' % sub
        fname, headers = urllib.urlretrieve(sub['url'])
        if 'Content-Type' in headers:
            if headers['Content-Type'] in zip_exts:
                z = zipfile.ZipFile(fname, 'r')
                for fn in z.namelist():
                    if fn.endswith('.srt') or fn.endswith('.sub'):
                        # Refuse archive members that could escape the
                        # current directory
                        if '..' in fn or fn.startswith('/'):
                            print 'Dangerous file name:', fn
                            continue
                        print 'Extracting', fn, '...'
                        z.extract(fn)
            elif headers['Content-Type'] in rar_exts:
                if subprocess.call(['rar', 'x', fname]) != 0:
                    print 'Error unraring file %s' % fname
            else:
                print 'Unrecognized file type:', headers['Content-Type']
        else:
            print 'No Content-Type!'


get_subs(sys.argv[1], sys.argv[2:])
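
# Example invocation (script name and argument values are illustrative only):
#   python subdivx.py "some show s01e01" t:720p a:someuser
# The first argument is the search string; every further argument is a filter.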