]> git.llucax.com Git - software/subdivxget.git/blob - subdivxget
Group subtitle extensions in a variable
[software/subdivxget.git] / subdivxget
1 #!/usr/bin/env python
2
3 import sys
if sys.version_info[0] < 3:
        # Python 2: the HTML parser and the URL helpers live in different
        # modules than in Python 3.
        from HTMLParser import HTMLParser
        from urllib import urlopen, urlretrieve, urlencode
        def get_encoding(info):
                """Return the charset declared in the response headers *info*.

                Falls back to 'ascii' when no charset is declared, so both
                interpreter branches give callers a usable codec name.
                """
                return info.getparam('charset') or 'ascii'

else:
        from html.parser import HTMLParser
        from urllib.request import urlopen, urlretrieve
        from urllib.parse import urlencode
        def get_encoding(info):
                """Return the charset declared in the response headers *info*.

                Falls back to 'ascii' when no charset is declared.
                """
                return info.get_content_charset('ascii')
16 import zipfile
17 import subprocess
18
class SubDivXQuery:
        """Describe one page of a subtitle search on www.subdivx.com.

        The host name and the two paths are fixed. The search text and the
        page number passed to the constructor are stored in the ``buscar``
        and ``pg`` entries of ``self.query``; the other entries are constant.
        """

        def __init__(self, to_search, page_number):
                self.host = "www.subdivx.com"
                self.page = "/index.php"
                self.down_page = "/bajar.php"
                # Key order is kept as-is because it is the order in which
                # the fields appear in the encoded query string.
                self.query = {
                        'buscar': to_search,
                        'pg': page_number,
                        'accion': 5,
                        'masdesc': '',
                        'subtitulos': 1,
                        'realiza_b': 1,
                }

        @property
        def url(self):
                """Absolute URL of the search page, query string included."""
                encoded = urlencode(self.query)
                return 'http://{0}{1}?{2}'.format(self.host, self.page, encoded)

        @property
        def page_uri(self):
                """Search page path plus query string, without scheme or host."""
                return '?'.join((self.page, urlencode(self.query)))

        @property
        def down_uri(self):
                """Absolute URL prefix shared by the download links."""
                return ''.join(('http://', self.host, self.down_page))


42
43
class SubDivXHTMLParser(HTMLParser):
        """Collect subtitle entries from a search result page.

        Every ``<div id="menu_detalle_buscador">`` starts a new dict that is
        appended to ``self.subs``. While an entry is being parsed, text is
        stored under the keys 'titulo', 'desc', 'downloads', 'cds',
        'comentarios', 'formato', 'autor' and 'fecha', and the first-matching
        download link is stored under 'url'.

        ``depth`` counts the tags opened since parsing started; parsing stops
        when it drops back to zero. ``attr`` names the field currently being
        filled and ``attr_depth`` is the depth at which that field ends; an
        ``attr_depth`` of 0 means "the field is just the next text chunk".
        """

        IDLE = 1
        HEADER = 2

        # Text labels that announce a field, mapped to the key that the
        # following text chunk is stored under.
        _FIELD_LABELS = {
                'Downloads:': 'downloads',
                'Cds:': 'cds',
                'Comentarios:': 'comentarios',
                'Formato:': 'formato',
                'Subido por:': 'autor',
                'el': 'fecha',
        }

        def __init__(self, down_uri):
                HTMLParser.__init__(self)
                self.down_uri = down_uri
                self.subs = []
                self.cur = None
                self.parsing = False
                self.depth = 0
                self.attr = None
                self.attr_depth = 0
                self.in_script_style = False

        def _begin_attr(self, name, end_depth):
                """Start collecting text for field *name* of the current entry."""
                self.attr = name
                self.attr_depth = end_depth
                self.cur[name] = ''

        def handle_starttag(self, tag, attrs):
                attributes = dict(attrs)
                elem_id = attributes.get('id')
                if tag == 'div' and elem_id == 'menu_detalle_buscador':
                        # A new result starts here.
                        entry = {}
                        self.subs.append(entry)
                        self.cur = entry
                        self.parsing = True
                if not self.parsing:
                        return
                if tag in ('script', 'style'):
                        # These tags are skipped entirely and never counted
                        # in the depth (see handle_data()).
                        self.in_script_style = True
                        return
                if tag == 'div':
                        if elem_id == 'buscador_detalle_sub':
                                self._begin_attr('desc', self.depth + 1)
                elif tag == 'a':
                        if attributes.get('class') == 'titulo_menu_izq':
                                self._begin_attr('titulo', self.depth + 1)
                        elif attributes.get('href', '').startswith(self.down_uri):
                                self.cur['url'] = attributes['href']
                # br tags are usually not closed, so they are left out of the
                # depth calculation.
                if tag != 'br':
                        self.depth += 1

        def handle_endtag(self, tag):
                if self.parsing:
                        if tag in ('script', 'style'):
                                self.in_script_style = False
                                return
                        # Closing the tag that opened the current field ends it.
                        if self.attr_depth == self.depth:
                                self.attr, self.attr_depth = None, 0
                        # Mirrors the br exception made in handle_starttag().
                        if tag != 'br':
                                self.depth -= 1
                if not self.depth:
                        self.parsing = False

        def handle_data(self, data):
                # Comments inside <script>/<style> do not reach
                # handle_comment(), so the whole content of those tags is
                # simply ignored.
                if not self.parsing or self.in_script_style:
                        return
                text = data.strip()
                if not text:
                        return
                if self.attr is None:
                        # Not inside a field: the text may be a label that
                        # announces one.
                        field = self._FIELD_LABELS.get(text)
                        if field is not None:
                                self._begin_attr(field, 0)
                        return
                value = self.cur[self.attr] + ' ' + text
                if self.attr_depth == 0:
                        # Single-chunk field: store it trimmed and close it.
                        self.cur[self.attr] = value.strip()
                        self.attr = None
                else:
                        self.cur[self.attr] = value


129
130
def filter_subtitles(subs, filters):
        """Return the subtitles that satisfy every filter in *filters*.

        Each filter is a case-insensitive substring. A filter of the form
        ``X:text`` (one-letter prefix, at least one character of text) is
        matched only against one field:

            t: titulo   d: desc    a: autor   f: formato
            c: comentarios   C: cds   F: fecha   D: downloads

        Any other filter, including one with an unrecognized prefix, is
        matched as-is against all the values of the subtitle.

        subs    -- list of dicts mapping field names to strings
        filters -- list of filter strings; when empty, *subs* itself is
                   returned unchanged

        A subtitle that lacks the requested field simply does not match.
        """
        if not filters:
                return subs

        field_keys = {
                't': 'titulo',
                'd': 'desc',
                'a': 'autor',
                'f': 'formato',
                'c': 'comentarios',
                'C': 'cds',
                'F': 'fecha',
                'D': 'downloads',
        }

        def matches_any(sub, needle):
                # needle is already lower-cased by the caller
                return any(needle in value.lower() for value in sub.values())

        def matches(sub, spec):
                if len(spec) > 2 and spec[1] == ':':
                        key = field_keys.get(spec[0])
                        if key is not None:
                                return spec[2:].lower() in sub.get(key, '').lower()
                # No prefix, or not a recognized field identifier: use the
                # raw filter. The whole filter is lower-cased, prefix
                # included, because the values are lower-cased as well.
                return matches_any(sub, spec.lower())

        return [sub for sub in subs
                        if all(matches(sub, spec) for spec in filters)]


179
180
def subdivx_get_subs(query_str):
        """Fetch every result page for *query_str* and return the subtitles.

        Pages are requested one after another, starting at page 1, until a
        page yields no subtitles. The result is a list of the dicts built by
        SubDivXHTMLParser, sorted by download count, highest first.
        """
        def downloads_of(sub):
                # An entry without a usable download count sorts last
                # instead of aborting the whole search.
                try:
                        return int(sub['downloads'])
                except (KeyError, ValueError):
                        return 0

        page_number = 1
        subs = []
        while True:
                query = SubDivXQuery(query_str, page_number)
                parser = SubDivXHTMLParser(query.down_uri)
                url = urlopen(query.url)
                try:
                        try:
                                encoding = get_encoding(url.info()) or 'ascii'
                        except Exception:
                                # Best effort: headers without usable charset
                                # information must not stop the search.
                                encoding = 'ascii'

                        for line in url:
                                # 'replace' keeps a stray undecodable byte
                                # from aborting the whole page.
                                parser.feed(line.decode(encoding, 'replace'))
                finally:
                        # Release the connection even if parsing fails.
                        url.close()

                if not parser.subs:
                        break

                subs.extend(parser.subs)
                page_number += 1

        return sorted(subs, key=downloads_of, reverse=True)


206
207
def _extract_zip_subtitles(fname, sub_exts):
        """Extract from the zip file *fname* the members ending in *sub_exts*."""
        with zipfile.ZipFile(fname, 'r') as z:
                z.printdir()
                for fn in z.namelist():
                        if not fn.endswith(sub_exts):
                                continue
                        # Refuse names that could escape the current directory
                        if '..' in fn or fn.startswith('/'):
                                print('Dangerous file name: %s' % fn)
                                continue
                        print('Extracting %s ...' % fn)
                        z.extract(fn)


def get_subs(query_str, filters):
        """Search for *query_str*, keep the results matching *filters*, and
        download and unpack each of them into the current directory.

        query_str -- text to search for
        filters   -- list of filter strings as understood by
                     filter_subtitles(); empty to keep every result

        Zip archives are unpacked with the zipfile module (subtitle files
        only); rar archives are handed to the external 'rar' program.
        Problems with a single subtitle are reported on standard output and
        the remaining ones are still processed.
        """
        sub_exts = ('.srt', '.sub')
        zip_exts = ('application/zip',)
        rar_exts = ('application/rar', 'application/x-rar-compressed')
        shown_fields = ('titulo', 'autor', 'fecha', 'downloads',
                        'comentarios', 'desc')

        subs = subdivx_get_subs(query_str)
        subs = filter_subtitles(subs, filters)

        for sub in subs:
                # Fields the parser did not find are shown as empty instead
                # of raising KeyError in the format below.
                info = dict.fromkeys(shown_fields, '')
                info.update(sub)
                print('''\
- %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
  %(desc)s
        DOWNLOADING ...
''' % info)
                if 'url' not in sub:
                        print('No download URL found!')
                        continue
                fname, headers = urlretrieve(sub['url'])
                if 'Content-Type' not in headers:
                        print('No Content-Type!')
                        continue
                # Ignore parameters such as "; charset=..." and letter case
                content_type = headers['Content-Type'].split(';')[0].strip().lower()
                if content_type in zip_exts:
                        _extract_zip_subtitles(fname, sub_exts)
                elif content_type in rar_exts:
                        try:
                                status = subprocess.call(['rar', 'x', fname])
                        except OSError:
                                # The rar program is missing or not runnable
                                status = -1
                        if status != 0:
                                print('Error unraring file %s' % fname)
                else:
                        print('Unrecognized file type: %s' % headers['Content-Type'])
242
243
# Run the search only when executed as a script, and fail with a usage
# message instead of an IndexError when the query argument is missing.
if __name__ == '__main__':
        if len(sys.argv) < 2:
                sys.exit('Usage: %s QUERY [FILTER ...]' % sys.argv[0])
        get_subs(sys.argv[1], sys.argv[2:])
245