git.llucax.com Git - software/subdivxget.git/blob - subdivxget
Move unzipping to a separate function
[software/subdivxget.git] / subdivxget
1 #!/usr/bin/env python
2
3 import sys
if sys.version_info[0] < 3:
        from HTMLParser import HTMLParser
        from urllib import urlopen, urlretrieve, urlencode
        def get_encoding(info):
                """Return the charset declared in the response headers.

                Python 2 variant.  *info* is the headers object of a
                urlopen() response.  Falls back to 'ascii' when no charset
                parameter is declared, so that both variants of this
                function behave the same way.
                """
                # getparam() yields None when the parameter is absent, which
                # is not a usable codec name for callers.
                return info.getparam('charset') or 'ascii'

else:
        from html.parser import HTMLParser
        from urllib.request import urlopen, urlretrieve
        from urllib.parse import urlencode
        def get_encoding(info):
                """Return the charset declared in the response headers.

                Python 3 variant.  *info* is the headers object of a
                urlopen() response.  Falls back to 'ascii' when no charset
                parameter is declared.
                """
                return info.get_content_charset('ascii')
16 import zipfile
17 import subprocess
18
19
def output(fo, fmt, *args, **kargs):
        """Write one %-formatted line to the file object *fo*.

        When positional arguments are given they are used to fill *fmt*;
        otherwise the keyword arguments are used as a mapping.  A newline
        is appended to the formatted text.
        """
        values = args if args else kargs
        fo.write(fmt % values + '\n')
24
def echo(fmt, *args, **kargs):
        """Write a %-formatted line to standard output via output()."""
        stream = sys.stdout
        output(stream, fmt, *args, **kargs)
27
def error(fmt, *args, **kargs):
        """Write a %-formatted line to standard error via output()."""
        stream = sys.stderr
        output(stream, fmt, *args, **kargs)
30
31
class SubDivXQuery:
        """Build the URLs used for one page of a subdivx.com search.

        *to_search* is the search text and *page_number* the results page
        to request; both end up in the query string.
        """

        def __init__(self, to_search, page_number):
                self.host = "www.subdivx.com"
                self.page = "/index.php"
                self.down_page = "/bajar.php"
                # Parameters sent in the query string of the search page
                self.query = {
                        'buscar': to_search,
                        'pg': page_number,
                        'accion': 5,
                        'masdesc': '',
                        'subtitulos': 1,
                        'realiza_b': 1,
                }

        @property
        def url(self):
                """Absolute URL of the search page, query string included."""
                encoded = urlencode(self.query)
                return 'http://' + self.host + self.page + '?' + encoded

        @property
        def page_uri(self):
                """Path of the search page plus the encoded query string."""
                return '%s?%s' % (self.page, urlencode(self.query))

        @property
        def down_uri(self):
                """Absolute URL prefix of the download page."""
                return 'http://%s%s' % (self.host, self.down_page)
55
56
class SubDivXHTMLParser(HTMLParser):
        """Collect subtitle entries from a search results page.

        Each entry is a dict appended to ``self.subs``.  A new entry is
        started by a ``<div id="menu_detalle_buscador">``; its fields are
        filled from the tags and text that follow while ``self.parsing``
        is true.  *down_uri* is the URL prefix identifying download links.
        """

        IDLE = 1
        HEADER = 2

        # Text labels that introduce a single-chunk field, mapped to the
        # key under which the following text is stored.
        _LABELS = {
                'Downloads:': 'downloads',
                'Cds:': 'cds',
                'Comentarios:': 'comentarios',
                'Formato:': 'formato',
                'Subido por:': 'autor',
                'el': 'fecha',
        }

        def __init__(self, down_uri):
                HTMLParser.__init__(self)
                self.down_uri = down_uri
                self.subs = []
                self.cur = None
                self.parsing = False
                # Count of currently open tags seen while parsing
                self.depth = 0
                # Field being collected and the depth at which it ends
                # (0 means "only the next non-empty text chunk")
                self.attr = None
                self.attr_depth = 0
                self.in_script_style = False

        def _begin_attr(self, name, depth):
                """Start collecting text for field *name* of the entry."""
                self.attr = name
                self.attr_depth = depth
                self.cur[name] = ''

        def _end_attr(self):
                """Stop collecting text for the current field."""
                self.attr = None
                self.attr_depth = 0

        def handle_starttag(self, tag, attrs):
                attributes = dict(attrs)
                elem_id = attributes.get('id')
                if tag == 'div' and elem_id == 'menu_detalle_buscador':
                        # A new subtitle entry begins here
                        self.cur = {}
                        self.subs.append(self.cur)
                        self.parsing = True
                if not self.parsing:
                        return
                if tag in ('script', 'style'):
                        # Not counted in depth; handle_endtag() mirrors this
                        self.in_script_style = True
                        return
                if tag == 'div':
                        if elem_id == 'buscador_detalle':
                                self.parsing = True
                        elif elem_id == 'buscador_detalle_sub':
                                self._begin_attr('desc', self.depth + 1)
                elif tag == 'a':
                        if attributes.get('class') == 'titulo_menu_izq':
                                self._begin_attr('titulo', self.depth + 1)
                        elif attributes.get('href', '').startswith(
                                        self.down_uri):
                                self.cur['url'] = attributes['href']
                # br tags are usually left unclosed, so they are not counted
                if tag != 'br':
                        self.depth += 1

        def handle_endtag(self, tag):
                if self.parsing:
                        if tag in ('script', 'style'):
                                self.in_script_style = False
                                return
                        if self.attr_depth == self.depth:
                                self._end_attr()
                        # br is not counted, see handle_starttag()
                        if tag != 'br':
                                self.depth -= 1
                if not self.depth:
                        self.parsing = False

        def handle_data(self, data):
                if not self.parsing:
                        return
                text = data.strip()
                # Comments inside <script>/<style> do not reach
                # handle_comment(), so the whole content is ignored
                if self.in_script_style:
                        return
                if self.attr is not None and text:
                        self.cur[self.attr] += ' ' + text
                        if self.attr_depth == 0:
                                # Single-chunk field: finish it right away
                                self.cur[self.attr] = \
                                                self.cur[self.attr].strip()
                                self._end_attr()
                        return
                field = self._LABELS.get(text)
                if field is not None:
                        self._begin_attr(field, 0)
142
143
def filter_subtitles(subs, filters):
        """Return the subtitles in *subs* that satisfy every filter.

        *subs* is a list of dicts with string values.  Each filter is a
        string matched case-insensitively as a substring.  A filter of the
        form ``X:text`` restricts the match to one field when ``X`` is a
        known identifier (t: titulo, d: desc, a: autor, f: formato,
        c: comentarios, C: cds, F: fecha, D: downloads); any other filter
        is looked up in all the fields.

        When *filters* is empty, *subs* itself is returned; otherwise a new
        list is built.
        """
        field_keys = {
                't': 'titulo',
                'd': 'desc',
                'a': 'autor',
                'f': 'formato',
                'c': 'comentarios',
                'C': 'cds',
                'F': 'fecha',
                'D': 'downloads',
        }

        def matches_any(sub, needle):
                return any(needle in value.lower() for value in sub.values())

        def is_good(sub, spec):
                if len(spec) > 2 and spec[1] == ':':
                        key = field_keys.get(spec[0])
                        if key is not None:
                                # A subtitle lacking the field simply does
                                # not match, instead of raising KeyError
                                value = sub.get(key, '')
                                return spec[2:].lower() in value.lower()
                # No (recognized) field identifier: use the raw filter,
                # lowercased as a whole so the prefix can match too
                return matches_any(sub, spec.lower())

        if not filters:
                return subs

        return [sub for sub in subs
                        if all(is_good(sub, spec) for spec in filters)]
192
193
def subdivx_get_subs(query_str):
        """Fetch every page of search results for *query_str*.

        Pages are requested one after another, starting at 1, until a page
        yields no subtitles.  Returns the list of subtitle dicts collected
        by SubDivXHTMLParser.
        """
        page_number = 1
        subs = []
        while True:
                query = SubDivXQuery(query_str, page_number)
                url = urlopen(query.url)
                parser = SubDivXHTMLParser(query.down_uri)

                try:
                        try:
                                # Guard against a missing charset (None)
                                encoding = get_encoding(url.info()) or 'ascii'
                        except Exception:
                                encoding = 'ascii'

                        for line in url:
                                # Undecodable bytes are replaced rather than
                                # aborting the whole search
                                parser.feed(line.decode(encoding, 'replace'))
                finally:
                        # Release the connection even if parsing fails
                        url.close()

                if not parser.subs:
                        break

                subs.extend(parser.subs)
                page_number += 1

        return subs
219
220
def unzip_subs(fname):
        """Extract the subtitle files found in the zip archive *fname*.

        The archive listing is printed first.  Only members whose name ends
        in a subtitle extension (compared case-insensitively) are extracted,
        into the current directory.  Members with a name that could escape
        it (containing '..' or starting with '/') are reported and skipped.
        """
        sub_exts = ('.srt', '.sub')
        z = zipfile.ZipFile(fname, 'r')
        try:
                z.printdir()
                for fn in z.namelist():
                        if not fn.lower().endswith(sub_exts):
                                continue
                        if '..' in fn or fn.startswith('/'):
                                error('Ignoring file with dangerous name: %s',
                                                fn)
                                continue
                        echo('Extracting %s...', fn)
                        z.extract(fn)
        finally:
                # Always release the archive's file handle
                z.close()
233
234
def _downloads_count(sub):
        """Return the download count of *sub* as an int (0 if unknown)."""
        # Keep only the digits so separators such as '1,234' do not break
        # the conversion
        digits = ''.join(c for c in sub.get('downloads', '') if c.isdigit())
        return int(digits) if digits else 0


def get_subs(query_str, filters):
        """Search, filter, download and unpack subtitles.

        Searches for *query_str*, keeps the results matching *filters* (see
        filter_subtitles()), and processes them from most to least
        downloaded.  Each one is downloaded and unpacked according to its
        Content-Type: zip archives with unzip_subs(), rar archives with the
        external 'rar' command.  Problems with a single subtitle are
        reported with error() and do not stop the remaining ones.
        """
        zip_exts = ('application/zip',)
        rar_exts = ('application/rar', 'application/x-rar-compressed')
        # Fields used by the summary below; missing ones are shown empty
        shown = ('titulo', 'autor', 'fecha', 'downloads', 'comentarios',
                        'desc')

        subs = subdivx_get_subs(query_str)
        subs = filter_subtitles(subs, filters)
        subs.sort(key=_downloads_count, reverse=True)

        for sub in subs:
                info = dict.fromkeys(shown, '')
                info.update(sub)
                echo('''\
- %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
  %(desc)s
        DOWNLOADING ...
''', **info)
                if 'url' not in sub:
                        error('No download URL found!')
                        continue
                fname, headers = urlretrieve(sub['url'])
                if 'Content-Type' not in headers:
                        error('No Content-Type!')
                        continue
                # Drop parameters such as '; charset=...' before comparing
                ctype = headers['Content-Type'].split(';')[0].strip().lower()
                if ctype in zip_exts:
                        unzip_subs(fname)
                elif ctype in rar_exts:
                        try:
                                status = subprocess.call(['rar', 'x', fname])
                        except OSError:
                                # The 'rar' command could not be executed
                                status = -1
                        if status != 0:
                                error('Error unraring file %s', fname)
                else:
                        error('Unrecognized file type: %s',
                                        headers['Content-Type'])
261
262
if __name__ == '__main__':
        # Usage: subdivxget QUERY [FILTER ...]
        # The first argument is the search text, the rest are filters.
        if len(sys.argv) < 2:
                error('Usage: %s QUERY [FILTER ...]', sys.argv[0])
                sys.exit(1)
        get_subs(sys.argv[1], sys.argv[2:])
264