]> git.llucax.com Git - software/subdivxget.git/blob - subdivxget
Use lowercase for help messages to match --help
[software/subdivxget.git] / subdivxget
1 #!/usr/bin/env python
2
3 import sys
# Python 2/3 compatibility shim: import HTMLParser and the URL helpers
# from their old or new locations, and define a uniform get_encoding()
# to read the charset out of a urlopen() response's info() object.
if sys.version_info[0] < 3:
        from HTMLParser import HTMLParser
        from urllib import urlopen, urlretrieve, urlencode
        def get_encoding(info):
                # Python 2 message objects expose the charset via getparam().
                return info.getparam('charset')

else:
        from html.parser import HTMLParser
        from urllib.request import urlopen, urlretrieve
        from urllib.parse import urlencode
        def get_encoding(info):
                # Python 3: default to 'ascii' when no charset is declared.
                return info.get_content_charset('ascii')
16 import zipfile
17 import subprocess
18
19
def output(fo, fmt, *args, **kargs):
        """Write fmt, %-formatted with args, to the file object fo.

        Positional arguments take precedence; when none are given the
        keyword arguments are used as a mapping instead.  A trailing
        newline is always appended.
        """
        params = args or kargs
        fo.write((fmt % params) + '\n')
24
def echo(fmt, *args, **kargs):
        """Print a progress message to stdout, unless --quiet was given."""
        global opts
        if not opts.quiet:
                output(sys.stdout, fmt, *args, **kargs)
30
def error(fmt, *args, **kargs):
        """Print a message to stderr (shown even in --quiet mode)."""
        output(sys.stderr, fmt, *args, **kargs)
33
34
class SubDivXQuery:
        """Builds the URLs needed to search and download from subdivx.com."""

        def __init__(self, to_search, page_number):
                self.host = "www.subdivx.com"
                self.page = "/index.php"
                self.down_page = "/bajar.php"
                # Form fields expected by the subdivx.com search page.
                self.query = {
                        'buscar': to_search,
                        'pg': page_number,
                        'accion': 5,
                        'masdesc': '',
                        'subtitulos': 1,
                        'realiza_b': 1,
                }

        @property
        def url(self):
                """Absolute URL of the search results page."""
                return 'http://%s%s?%s' % (self.host, self.page,
                                urlencode(self.query))

        @property
        def page_uri(self):
                """Search results URI, relative to the host."""
                return self.page + '?' + urlencode(self.query)

        @property
        def down_uri(self):
                """Absolute base URL of the subtitle download page."""
                return 'http://' + self.host + self.down_page
58
59
class SubDivXHTMLParser(HTMLParser):
        """Scrapes subtitle entries from a subdivx.com search results page.

        Each result found is collected as a dict in self.subs; keys seen in
        this code are 'titulo', 'desc', 'url', 'downloads', 'cds',
        'comentarios', 'formato', 'autor' and 'fecha' (a key is only
        present if the corresponding data appeared in the page).
        """

        # NOTE(review): these two constants are not referenced anywhere in
        # this class; presumably leftovers from an earlier design.
        IDLE = 1
        HEADER = 2

        def __init__(self, down_uri):
                HTMLParser.__init__(self)
                self.down_uri = down_uri      # URL prefix marking download links
                self.depth = 0                # tag nesting depth inside a result
                self.parsing = False          # currently inside a result block?
                self.subs = []                # accumulated subtitle dicts
                self.attr = None              # dict key currently capturing text
                self.attr_depth = 0           # depth at which self.attr ends
                self.cur = None               # dict of the result being parsed
                self.in_script_style = False  # inside a <script>/<style> tag?

        def handle_starttag(self, tag, attrs):
                attrs = dict(attrs)
                # Each search result begins with this div: start a new dict.
                if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
                        self.cur = dict()
                        self.subs.append(self.cur)
                        self.parsing = True
                if not self.parsing:
                        return
                # Text inside <script>/<style> must not be captured as data.
                if tag == 'script' or tag == 'style':
                        self.in_script_style = True
                        return
                if tag == 'div':
                        if attrs.get('id') == 'buscador_detalle':
                                # NOTE(review): parsing is already True here;
                                # this assignment appears redundant.
                                self.parsing = True
                        elif attrs.get('id') == 'buscador_detalle_sub':
                                # Description block: capture its text until
                                # this div closes (depth + 1).
                                self.attr = 'desc'
                                self.attr_depth = self.depth + 1
                                self.cur[self.attr] = ''
                elif tag == 'a':
                        if attrs.get('class') == 'titulo_menu_izq':
                                # Title link: capture its text.
                                self.attr = 'titulo'
                                self.attr_depth = self.depth + 1
                                self.cur[self.attr] = ''
                        elif attrs.get('href', '').startswith(self.down_uri):
                                # Link pointing at the download page.
                                self.cur['url'] = attrs['href']
                # br are usually not closed, so ignore them in depth calculation
                if self.parsing and tag != 'br':
                        self.depth += 1

        def handle_endtag(self, tag):
                if self.parsing:
                        if tag == 'script' or tag == 'style':
                                self.in_script_style = False
                                return
                        # Closing the tag that owns the current attribute
                        # finishes its text capture.
                        if self.depth == self.attr_depth:
                                self.attr = None
                                self.attr_depth = 0
                        # see comment in handle_starttag()
                        if tag != 'br':
                                self.depth -= 1
                # Back at depth 0 means the result block is fully closed.
                if self.depth == 0:
                        self.parsing = False

        def handle_data(self, data):
                if not self.parsing:
                        return
                data = data.strip()
                # Hack to handle comments in <script> <style> which don't end
                # up in handle_comment(), so we just ignore the whole tags
                if self.in_script_style:
                        return
                if self.attr is not None and data:
                        # Append this chunk to the attribute being captured.
                        self.cur[self.attr] += ' ' + data
                        # attr_depth == 0 marks a label-driven attribute (set
                        # below): it takes exactly one data chunk.
                        if self.attr_depth == 0:
                                self.cur[self.attr] = self.cur[self.attr].strip()
                                self.attr = None
                                self.attr_depth = 0
                elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
                        # A field label: the next data chunk is its value.
                        self.attr = data[:-1].lower()
                        self.attr_depth = 0
                        self.cur[self.attr] = ''
                elif data == 'Subido por:':
                        self.attr = 'autor'
                        self.attr_depth = 0
                        self.cur[self.attr] = ''
                elif data == 'el':
                        self.attr = 'fecha'
                        self.attr_depth = 0
                        self.cur[self.attr] = ''
145
146
def filter_subtitles(subs, filters):
        """Return the subtitles in subs that match ALL the filters.

        Each filter has the format "X:text", where X is a one-letter field
        specifier (t=titulo, d=desc, a=autor, f=formato, c=comentarios,
        C=cds, F=fecha, D=downloads) and text is searched for
        case-insensitively in that field.  A filter without a recognized
        specifier is searched for in every field.  With no filters, subs
        is returned unchanged.
        """
        # Dispatch table replacing the original if/elif chain.
        field_keys = {
                't': 'titulo',
                'd': 'desc',
                'a': 'autor',
                'f': 'formato',
                'c': 'comentarios',
                'C': 'cds',
                'F': 'fecha',
                'D': 'downloads',
        }

        def is_any_good(sub, text):
                # True if text appears in any field of the subtitle.
                for value in sub.values():
                        if value.lower().find(text) >= 0:
                                return True
                return False

        def is_good(sub, filter_str):
                field = None
                if len(filter_str) > 2 and filter_str[1] == ':':
                        field = filter_str[0]
                        filter_str = filter_str[2:]
                filter_str = filter_str.lower()

                if field is None:
                        return is_any_good(sub, filter_str)
                key = field_keys.get(field)
                if key is None:
                        # Not a recognized field identifier, use the raw filter
                        return is_any_good(sub, field + ':' + filter_str)
                # .get() so a subtitle missing the field simply doesn't
                # match, instead of raising KeyError.
                return sub.get(key, '').lower().find(filter_str) >= 0

        if not filters:
                return subs

        return [sub for sub in subs
                        if all(is_good(sub, f) for f in filters)]
195
196
def subdivx_get_subs(query_str):
        """Query subdivx.com for query_str and return parsed subtitles.

        Fetches successive result pages until an empty one is returned,
        accumulating every subtitle dict produced by SubDivXHTMLParser.
        """
        page_number = 1
        subs = []
        while True:
                query = SubDivXQuery(query_str, page_number)
                url = urlopen(query.url)
                parser = SubDivXHTMLParser(query.down_uri)

                try:
                        encoding = get_encoding(url.info())
                except Exception:
                        # Was a bare `except:`, which would also swallow
                        # KeyboardInterrupt/SystemExit; fall back to ASCII
                        # when the charset can't be determined.
                        encoding = 'ascii'

                # Close the connection even if decoding/parsing fails.
                try:
                        for line in url:
                                parser.feed(line.decode(encoding))
                finally:
                        url.close()

                if not parser.subs:
                        break

                subs.extend(parser.subs)
                page_number += 1

        return subs
222
223
def unzip_subs(fname):
        """Extract subtitle files (.srt/.sub) from the zip archive fname.

        Members with dangerous names ('..' components or absolute paths)
        are reported and skipped to avoid path traversal.
        """
        sub_exts = ('.srt', '.sub')
        z = zipfile.ZipFile(fname, 'r')
        # try/finally: the archive was previously never closed (fd leak).
        try:
                z.printdir()
                for fn in z.namelist():
                        if fn.endswith(sub_exts):
                                if '..' in fn or fn.startswith('/'):
                                        error('Ignoring file with dangerous name: %s',
                                                        fn)
                                        continue
                                echo('Extracting %s...', fn)
                                z.extract(fn)
        finally:
                z.close()
236
237
def get_subs(query_str, filters):
        """Search, filter and download subtitles matching query_str.

        Results are listed most-downloaded first.  Unless --list-only was
        given, each subtitle is downloaded and unpacked: zip archives via
        unzip_subs(), rar archives via the external `rar` tool.
        """
        global opts
        zip_exts = ('application/zip',)
        rar_exts = ('application/rar', 'application/x-rar-compressed')

        subs = subdivx_get_subs(query_str)
        subs = filter_subtitles(subs, filters)
        subs.sort(key=lambda s: int(s['downloads']), reverse=True)

        for sub in subs:
                echo('''\
- %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
  %(desc)s
        DOWNLOADING ...
''', **sub)
                if opts.list_only:
                        continue
                fname, headers = urlretrieve(sub['url'])
                if 'Content-Type' in headers:
                        if headers['Content-Type'] in zip_exts:
                                unzip_subs(fname)
                        elif headers['Content-Type'] in rar_exts:
                                if subprocess.call(['rar', 'x', fname]) != 0:
                                        error('Error unraring file %s', fname)
                        else:
                                # Bug fix: the format string was missing %s,
                                # making output() raise TypeError ("not all
                                # arguments converted") instead of printing.
                                error('Unrecognized file type: %s',
                                                headers['Content-Type'])
                else:
                        error('No Content-Type!')
267
268
def parse_args(argv):
        """Parse the command line, returning (query, filters, options).

        argv is the full argument vector including the program name.
        Exits via parser.error() when the query is missing or when
        --quiet and --list-only are combined.
        """
        from optparse import OptionParser
        parser = OptionParser(usage="%prog [OPTIONS] QUERY [FILTER ...]",
                        description="""
Download subtitles from subdivx.com searching the string QUERY. If FILTERs are
specified, only subtitles that matches all those filters are downloaded.
Filters have the format "X:filter", where X is a field specification: t=titulo,
d=desc, a=autor, f=formato, c=comentarios, C=cds, F=fecha and D=downloads.
filter is a string that should be found on that field (case insensitive). If
the format specifier is not known (or there isn't one) the filter string is
looked in all the fields.
                        """.strip())
        parser.add_option("-l", "--list-only",
                        default=False, action='store_true',
                        help="don't download the subtitles, just list them")
        parser.add_option("-q", "--quiet",
                        default=False, action='store_true',
                        help="don't print progress messages")

        # Bug fix: the argv parameter used to be ignored (parse_args() read
        # sys.argv implicitly); honor it, skipping the program name.
        (opts, args) = parser.parse_args(argv[1:])
        if not args:
                parser.error("Missing query string")

        if opts.quiet and opts.list_only:
                parser.error("Using --quiet and --list-only together doesn't "
                                "make any sense")

        return (args[0], args[1:], opts)
297
# Entry-point guard so the file can be imported without side effects.
# opts stays a module-level global, as echo()/get_subs() expect.
if __name__ == '__main__':
        (query_str, filters, opts) = parse_args(sys.argv)

        get_subs(query_str, filters)
301
302