]> git.llucax.com Git - software/subdivxget.git/blob - subdivxget
Add fairly fine grained filtering support
[software/subdivxget.git] / subdivxget
1 #!/usr/bin/env python
2
3 import sys
4 import urllib
5 import zipfile
6 import subprocess
7 import HTMLParser
8
class SubDivXQuery:
	"""Build the URLs needed to run a search against www.subdivx.com.

	Holds the host, the search/download page paths and the query
	parameters for one results page of one search string.
	"""
	def __init__(self, to_search, page_number):
		self.host = "www.subdivx.com"
		self.page = "/index.php"
		self.down_page = "/bajar.php"
		# Parameters expected by the subdivx.com search form
		self.query = {
			'buscar': to_search,
			'pg': page_number,
			'accion': 5,
			'masdesc': '',
			'subtitulos': 1,
			'realiza_b': 1,
		}
	@property
	def url(self):
		# Absolute search URL for this page of results
		return 'http://%s%s' % (self.host, self.page_uri)
	@property
	def page_uri(self):
		# Search path plus encoded query string (no scheme/host)
		return '%s?%s' % (self.page, urllib.urlencode(self.query))
	@property
	def down_uri(self):
		# Base URL that subtitle download links start with
		return 'http://%s%s' % (self.host, self.down_page)
32
33
class SubDivXHTMLParser(HTMLParser.HTMLParser):
	"""Scrape subtitle entries out of a subdivx.com search results page.

	Feed it the page HTML; it fills self.subs with one dict per
	subtitle entry. Keys observed to be set by this code: 'titulo',
	'desc', 'url', 'autor', 'fecha', 'downloads', 'cds',
	'comentarios' and 'formato' (a given entry may lack some of them
	if the page doesn't show that datum).
	"""

	# NOTE(review): these two constants are never referenced below —
	# presumably leftovers from an earlier state-machine design.
	IDLE = 1
	HEADER = 2

	def __init__(self, down_uri):
		HTMLParser.HTMLParser.__init__(self)
		self.down_uri = down_uri		# prefix identifying download links
		self.depth = 0				# tag nesting depth inside an entry
		self.parsing = False			# True while inside a subtitle entry
		self.subs = []				# accumulated subtitle dicts
		self.attr = None			# key of self.cur currently receiving text
		self.attr_depth = 0			# depth at which self.attr was opened
		self.cur = None				# dict for the entry being parsed
		self.in_script_style = False		# inside <script>/<style>, text ignored

	def handle_starttag(self, tag, attrs):
		attrs = dict(attrs)
		# Each search result starts with this header div; open a fresh
		# entry dict and start capturing.
		if tag == 'div' and attrs.get('id') == 'menu_detalle_buscador':
			self.cur = dict()
			self.subs.append(self.cur)
			self.parsing = True
		if not self.parsing:
			return
		if tag == 'script' or tag == 'style':
			self.in_script_style = True
			return
		if tag == 'div':
			if attrs.get('id') == 'buscador_detalle':
				self.parsing = True
			elif attrs.get('id') == 'buscador_detalle_sub':
				# Description body: capture text until depth drops
				# back below attr_depth (checked in handle_endtag).
				self.attr = 'desc'
				self.attr_depth = self.depth + 1
				self.cur[self.attr] = ''
		elif tag == 'a':
			if attrs.get('class') == 'titulo_menu_izq':
				# Link holding the subtitle title text
				self.attr = 'titulo'
				self.attr_depth = self.depth + 1
				self.cur[self.attr] = ''
			elif attrs.get('href', '').startswith(self.down_uri):
				# Download link for this entry
				self.cur['url'] = attrs['href']
		# br are usually not closed, so ignore them in depth calculation
		if self.parsing and tag != 'br':
			self.depth += 1

	def handle_endtag(self, tag):
		if self.parsing:
			if tag == 'script' or tag == 'style':
				self.in_script_style = False
				return
			# Closing the tag that opened the current attribute ends
			# its text capture.
			if self.depth == self.attr_depth:
				self.attr = None
				self.attr_depth = 0
			# see comment in handle_starttag()
			if tag != 'br':
				self.depth -= 1
		# Back at depth 0 means the whole entry div was closed.
		if self.depth == 0:
			self.parsing = False

	def handle_data(self, data):
		if not self.parsing:
			return
		data = data.strip()
		# Hack to handle comments in <script> <style> which don't end
		# up in handle_comment(), so we just ignore the whole tags
		if self.in_script_style:
			return
		if self.attr is not None and data:
			# Accumulate text into the currently open attribute.
			self.cur[self.attr] += ' ' + data
			# attr_depth == 0 marks a label/value pair (set below):
			# the single following text node is the whole value.
			if self.attr_depth == 0:
				self.cur[self.attr] = self.cur[self.attr].strip()
				self.attr = None
				self.attr_depth = 0
		# Label texts on the page; the text node right after each
		# label is captured as its value (e.g. 'Downloads:' -> key
		# 'downloads').
		elif data in ('Downloads:', 'Cds:', 'Comentarios:', 'Formato:'):
			self.attr = data[:-1].lower()
			self.attr_depth = 0
			self.cur[self.attr] = ''
		elif data == 'Subido por:':
			self.attr = 'autor'
			self.attr_depth = 0
			self.cur[self.attr] = ''
		elif data == 'el':
			# 'el' precedes the upload date on the page — presumably
			# part of "Subido por: X el <fecha>"; TODO confirm markup.
			self.attr = 'fecha'
			self.attr_depth = 0
			self.cur[self.attr] = ''
119
120
def filter_subtitles(subs, filters):
	"""Return the subtitles that match *all* the given filters.

	Each filter is a case-insensitive substring search. A filter of
	the form 'X:text' restricts the search to a single field, where X
	is one of: t=titulo, d=desc, a=autor, f=formato, c=comentarios,
	C=cds, F=fecha, D=downloads. Any other single-char prefix is not
	treated as a field and the whole filter string is searched raw.
	An empty filter list returns subs unchanged.
	"""
	# Field selector -> subtitle dict key
	field_keys = dict(
		t='titulo',
		d='desc',
		a='autor',
		f='formato',
		c='comentarios',
		C='cds',
		F='fecha',
		D='downloads',
	)

	def matches_any_field(sub, text):
		# Substring search over every value of the entry
		for value in sub.values():
			if value.lower().find(text) >= 0:
				return True
		return False

	def matches(sub, filt):
		field = None
		if len(filt) > 2 and filt[1] == ':':
			field = filt[0]
			filt = filt[2:]
		filt = filt.lower()

		if field is None:
			return matches_any_field(sub, filt)
		key = field_keys.get(field)
		if key is None:
			# Not a recognized field identifier, use the raw filter
			return matches_any_field(sub, field + ':' + filt)
		# BUG FIX: use .get() so an entry that lacks this field simply
		# doesn't match, instead of raising KeyError (the parser only
		# sets fields it actually finds in the page).
		return sub.get(key, '').lower().find(filt) >= 0

	if not filters:
		return subs

	# Keep only the entries that satisfy every filter (AND semantics).
	result = []
	for sub in subs:
		for filt in filters:
			if not matches(sub, filt):
				break
		else:
			result.append(sub)
	return result
169
170
def subdivx_get_subs(query_str):
	"""Fetch every result page for query_str from subdivx.com.

	Keeps requesting successive pages until one yields no entries,
	then returns all collected subtitle dicts sorted by download
	count, most downloaded first.
	"""
	all_subs = []
	page = 1
	while True:
		query = SubDivXQuery(query_str, page)
		parser = SubDivXHTMLParser(query.down_uri)
		response = urllib.urlopen(query.url)

		for chunk in response:
			parser.feed(chunk)

		response.close()

		# An empty page means we ran past the last page of results.
		if not parser.subs:
			break

		all_subs.extend(parser.subs)
		page += 1

	return sorted(all_subs, key=lambda s: int(s['downloads']), reverse=True)
191
192
193 def get_subs(query_str, filters):
194         zip_exts = ('application/zip',)
195         rar_exts = ('application/rar', 'application/x-rar-compressed')
196
197         subs = subdivx_get_subs(query_str)
198         subs = filter_subtitles(subs, filters)
199
200         for sub in subs:
201                 print '''\
202         - %(titulo)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
203           %(desc)s
204                 DOWNLOADING ...
205         ''' % sub
206                 fname, headers = urllib.urlretrieve(sub['url'])
207                 if 'Content-Type' in headers:
208                         if headers['Content-Type'] in zip_exts:
209                                 z = zipfile.ZipFile(fname, 'r')
210                                 z.printdir()
211                                 for fn in z.namelist():
212                                         if fn.endswith('.srt') or fn.endswith('.sub'):
213                                                 if '..' in fn or fn.startswith('/'):
214                                                         print 'Dangerous file name:', fn
215                                                         continue
216                                                 print 'Extracting', fn, '...'
217                                                 z.extract(fn)
218                         elif headers['Content-Type'] in rar_exts:
219                                 if subprocess.call(['rar', 'x', fname]) != 0:
220                                         print 'Error unraring file %s' % fname
221                         else:
222                                 print 'Unrecognized file type:', headers['Content-Type']
223                 else:
224                         print 'No Content-Type!'
225
226
227 get_subs(sys.argv[1], sys.argv[2:])
228