#!/usr/bin/env python

import sys
if sys.version_info[0] < 3:
        from HTMLParser import HTMLParser
        from urllib import urlopen, urlretrieve, urlencode
        def get_encoding(info):
                return info.getparam('charset')

else:
        from html.parser import HTMLParser
        from urllib.request import urlopen, urlretrieve
        from urllib.parse import urlencode
        def get_encoding(info):
                return info.get_content_charset('ascii')
import zipfile
import subprocess


def output(fo, fmt, *args, **kargs):
        # Format with positional args if any were given, otherwise with
        # keyword args
        if not args:
                args = kargs
        fo.write((fmt % args) + '\n')

def echo(fmt, *args, **kargs):
        output(sys.stdout, fmt, *args, **kargs)

def error(fmt, *args, **kargs):
        output(sys.stderr, fmt, *args, **kargs)


class SubDivXQuery:
        def __init__(self, to_search, page_number):
                self.host = "www.subdivx.com"
                self.page = "/index.php"
                self.down_page = "/bajar.php"
                self.query = dict(
                        buscar = to_search,
                        pg = page_number,
                        accion = 5,
                        masdesc = '',
                        subtitulos = 1,
                        realiza_b = 1,
                )
        @property
        def url(self):
                return 'http://%s%s?%s' % (self.host, self.page,
                                urlencode(self.query))
        @property
        def page_uri(self):
                return self.page + '?' + urlencode(self.query)
        @property
        def down_uri(self):
                return 'http://' + self.host + self.down_page

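# Illustrative example of the generated search URL (the parameter order in
# the query string depends on urlencode() and may vary):
#   SubDivXQuery('the simpsons', 1).url
#   -> 'http://www.subdivx.com/index.php?buscar=the+simpsons&pg=1&accion=5&...'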

class SubDivXHTMLParser(HTMLParser):

        IDLE = 1
        HEADER = 2

        def __init__(self, down_uri):
                HTMLParser.__init__(self)
                self.down_uri = down_uri
                self.depth = 0
                self.parsing = False
                self.subs = []
                self.attr = None
                self.attr_depth = 0
                self.cur = None
                self.in_script_style = False

        def handle_starttag(self, tag, attrs):
                attrs = dict(attrs)
                if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
                        self.cur = dict()
                        self.subs.append(self.cur)
                        self.parsing = True
                if not self.parsing:
                        return
                if tag == 'script' or tag == 'style':
                        self.in_script_style = True
                        return
                if tag == 'div':
                        if attrs.get('id') == 'buscador_detalle':
                                self.parsing = True
                        elif attrs.get('id') == 'buscador_detalle_sub':
                                self.attr = 'desc'
                                self.attr_depth = self.depth + 1
                                self.cur[self.attr] = ''
                elif tag == 'a':
                        if attrs.get('class') == 'titulo_menu_izq':
                                self.attr = 'titulo'
                                self.attr_depth = self.depth + 1
                                self.cur[self.attr] = ''
                        elif attrs.get('href', '').startswith(self.down_uri):
                                self.cur['url'] = attrs['href']
                # br tags are usually not closed, so ignore them in the depth
                # calculation
                if self.parsing and tag != 'br':
                        self.depth += 1

        def handle_endtag(self, tag):
                if self.parsing:
                        if tag == 'script' or tag == 'style':
                                self.in_script_style = False
                                return
                        if self.depth == self.attr_depth:
                                self.attr = None
                                self.attr_depth = 0
                        # see comment in handle_starttag()
                        if tag != 'br':
                                self.depth -= 1
                if self.depth == 0:
                        self.parsing = False

        def handle_data(self, data):
                if not self.parsing:
                        return
                data = data.strip()
                # Hack to handle comments inside <script>/<style>, which
                # don't end up in handle_comment(), so we just ignore those
                # tags entirely
                if self.in_script_style:
                        return
                if self.attr is not None and data:
                        self.cur[self.attr] += ' ' + data
                        if self.attr_depth == 0:
                                self.cur[self.attr] = self.cur[self.attr].strip()
                                self.attr = None
                                self.attr_depth = 0
                elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
                        self.attr = data[:-1].lower()
                        self.attr_depth = 0
                        self.cur[self.attr] = ''
                elif data == 'Subido por:':
                        self.attr = 'autor'
                        self.attr_depth = 0
                        self.cur[self.attr] = ''
                elif data == 'el':
                        self.attr = 'fecha'
                        self.attr_depth = 0
                        self.cur[self.attr] = ''

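# Each search result is collected as a dict; given the attribute names
# assigned above, an entry should look roughly like (illustrative values):
#   {'titulo': '...', 'desc': '...', 'url': 'http://www.subdivx.com/bajar.php?...',
#    'autor': '...', 'fecha': '...', 'downloads': '...', 'cds': '...',
#    'comentarios': '...', 'formato': '...'}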

def filter_subtitles(subs, filters):
        def is_good(sub, filter):
                def is_any_good(sub, filter):
                        for value in sub.values():
                                if value.lower().find(filter) >= 0:
                                        return True

                field = None
                if len(filter) > 2 and filter[1] == ':':
                        field = filter[0]
                        filter = filter[2:]
                filter = filter.lower()

                if field is None:
                        return is_any_good(sub, filter)
                elif field == 't':
                        key = 'titulo'
                elif field == 'd':
                        key = 'desc'
                elif field == 'a':
                        key = 'autor'
                elif field == 'f':
                        key = 'formato'
                elif field == 'c':
                        key = 'comentarios'
                elif field == 'C':
                        key = 'cds'
                elif field == 'F':
                        key = 'fecha'
                elif field == 'D':
                        key = 'downloads'
                else:
                        # Not a recognized field identifier, use the raw filter
                        return is_any_good(sub, field + ':' + filter)

                return sub[key].lower().find(filter) >= 0

        if not filters:
                return subs

        result = []
        for sub in subs:
                for filter in filters:
                        if not is_good(sub, filter):
                                break
                else:
                        result.append(sub)
        return result

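# For example (illustrative values): with filters ['t:simpsons', 'F:2011']
# only subtitles whose 'titulo' contains 'simpsons' and whose 'fecha'
# contains '2011' are kept; a plain filter like 'latino' matches if any
# field contains it.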

def subdivx_get_subs(query_str):
        page_number = 1
        subs = []
        while True:
                query = SubDivXQuery(query_str, page_number)
                url = urlopen(query.url)
                parser = SubDivXHTMLParser(query.down_uri)

                try:
                        encoding = get_encoding(url.info())
                except Exception:
                        encoding = 'ascii'

                for line in url:
                        parser.feed(line.decode(encoding))

                url.close()

                # An empty results page means we walked past the last page
                if not parser.subs:
                        break

                subs.extend(parser.subs)
                page_number += 1

        return subs

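# Illustrative usage: subdivx_get_subs('the simpsons') returns one dict per
# search result, accumulated across all result pages.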

def unzip_subs(fname):
        sub_exts = ('.srt', '.sub')
        z = zipfile.ZipFile(fname, 'r')
        z.printdir()
        for fn in z.namelist():
                if fn.endswith(sub_exts):
                        # Refuse entries that could escape the working
                        # directory (path traversal)
                        if '..' in fn or fn.startswith('/'):
                                error('Ignoring file with dangerous name: %s',
                                                fn)
                                continue
                        echo('Extracting %s...', fn)
                        z.extract(fn)


def get_subs(query_str, filters):
        global opts
        zip_exts = ('application/zip',)
        rar_exts = ('application/rar', 'application/x-rar-compressed')

        subs = subdivx_get_subs(query_str)
        subs = filter_subtitles(subs, filters)
        subs.sort(key=lambda s: int(s['downloads']), reverse=True)

        for sub in subs:
                echo('''\
- %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
  %(desc)s
''', **sub)
                if opts.list_only:
                        continue
                echo('        DOWNLOADING ...')
                fname, headers = urlretrieve(sub['url'])
                if 'Content-Type' in headers:
                        if headers['Content-Type'] in zip_exts:
                                unzip_subs(fname)
                        elif headers['Content-Type'] in rar_exts:
                                if subprocess.call(['rar', 'x', fname]) != 0:
                                        error('Error unraring file %s', fname)
                        else:
                                error('Unrecognized file type: %s',
                                                headers['Content-Type'])
                else:
                        error('No Content-Type!')


def parse_args(argv):
        from optparse import OptionParser
        parser = OptionParser(usage="%prog [OPTIONS] QUERY [FILTER ...]",
                        description="""
Download subtitles from subdivx.com, searching for the string QUERY. If
FILTERs are specified, only subtitles matching all of them are downloaded.
Filters have the format "X:filter", where X is a field specifier: t=titulo,
d=desc, a=autor, f=formato, c=comentarios, C=cds, F=fecha and D=downloads.
filter is a string that should be found in that field (case insensitive). If
the field specifier is not known (or there isn't one), the filter string is
searched for in all fields.
                        """.strip())
        parser.add_option("-l", "--list-only",
                        default=False, action='store_true',
                        help="Don't download the subtitles, just list them")

        (opts, args) = parser.parse_args(argv[1:])
        if not args:
                parser.error("Missing query string")

        return (args[0], args[1:], opts)

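# Example invocations (illustrative query and filter values):
#   ./subdivxget "the simpsons s23e10" t:latino f:dvdrip
#   ./subdivxget -l "the simpsons"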
(query_str, filters, opts) = parse_args(sys.argv)

get_subs(query_str, filters)