]> git.llucax.com Git - software/subdivxget.git/blob - subdivxget
Initialize SubDivXQuery.cur
[software/subdivxget.git] / subdivxget
1 #!/usr/bin/env python
2
3 import sys
4 import urllib
5 import zipfile
6 import subprocess
7 import HTMLParser
8
class SubDivXQuery(object):
        """Build the URLs needed for one page of a SubDivX search.

        This is a new-style class (derives from ``object``) because the
        ``property`` descriptors used below are only documented to work on
        new-style classes in Python 2.

        Attributes:
        host      -- host name of the site
        page      -- path of the search page
        down_page -- path of the download page
        query     -- dict with the query-string parameters of the search
        """

        def __init__(self, to_search, page_number):
                """Prepare a search for `to_search` on results page `page_number`.

                to_search   -- text to search for (sent as the `buscar` parameter)
                page_number -- results page to request (sent as the `pg`
                               parameter)
                """
                self.host = "www.subdivx.com"
                self.page = "/index.php"
                self.down_page = "/bajar.php"
                # The remaining parameters are fixed values expected by the
                # site's search form; their exact meaning is not visible here
                self.query = dict(
                        buscar = to_search,
                        pg = page_number,
                        accion = 5,
                        masdesc = '',
                        subtitulos = 1,
                        realiza_b = 1,
                )

        @property
        def url(self):
                """Absolute URL of the search page, query string included."""
                return 'http://%s%s?%s' % (self.host, self.page,
                                urllib.urlencode(self.query))

        @property
        def page_uri(self):
                """Path of the search page plus the query string (no host)."""
                return self.page + '?' + urllib.urlencode(self.query)

        @property
        def down_uri(self):
                """Absolute URL prefix shared by the download links."""
                return 'http://' + self.host + self.down_page
32
33
class SubDivXHTMLParser(HTMLParser.HTMLParser):
        """Collect subtitle entries from a SubDivX search results page.

        Every ``<div id="menu_detalle_buscador">`` starts a new entry, a dict
        appended to `subs`.  While the parser is inside an entry (tracked by
        counting open tags in `depth`), text nodes are stored in the current
        dict under the key announced by the preceding tag or label.
        """

        # Not referenced by any method of this class
        IDLE = 1
        HEADER = 2

        # Text labels announcing that the next non-empty text node is a
        # value, mapped to the key under which that value is stored
        _LABELS = {
                'Downloads:': 'downloads',
                'Cds:': 'cds',
                'Comentarios:': 'comentarios',
                'Formato:': 'formato',
                'Subido por:': 'autor',
                'el': 'fecha',
        }

        def __init__(self, down_uri):
                """Create a parser; `down_uri` is the prefix of download links."""
                HTMLParser.HTMLParser.__init__(self)
                self.down_uri = down_uri
                self.subs = []                  # entries found so far
                self.cur = None                 # entry being filled
                self.attr = None                # key for the next text node
                self.parsing = False            # True while inside an entry
                self.depth = 0                  # open tags inside the entry
                self.in_script_style = False    # inside <script>/<style>

        def handle_starttag(self, tag, attrs):
                """Track entry boundaries and note which value comes next."""
                attributes = dict(attrs)
                elem_id = attributes.get('id')

                if tag == 'div' and elem_id == 'menu_detalle_buscador':
                        # Header of a new entry: start collecting into a
                        # fresh dict
                        self.cur = {}
                        self.subs.append(self.cur)
                        self.parsing = True

                if not self.parsing:
                        return

                if tag in ('script', 'style'):
                        # These tags are not counted in the depth
                        self.in_script_style = True
                        return

                if tag == 'div':
                        if elem_id == 'buscador_detalle':
                                # parsing is already True at this point
                                # (see the guard above)
                                self.parsing = True
                        elif elem_id == 'buscador_detalle_sub':
                                self.attr = 'desc'
                elif tag == 'a':
                        if attributes.get('class') == 'titulo_menu_izq':
                                self.attr = 'title'
                        elif attributes.get('href', '').startswith(
                                        self.down_uri):
                                self.cur['url'] = attributes['href']

                self.depth += 1

        def handle_endtag(self, tag):
                """Leave the entry once all its counted tags are closed."""
                if self.parsing and tag in ('script', 'style'):
                        self.in_script_style = False
                        return
                if self.parsing:
                        self.depth -= 1
                if self.depth == 0:
                        self.parsing = False

        def handle_data(self, data):
                """Store a pending value or recognize the label of the next."""
                # Hack to handle comments in <script> <style> which don't end
                # up in handle_comment(), so we just ignore the whole tags
                if not self.parsing or self.in_script_style:
                        return

                text = data.strip()
                if self.attr is not None and text:
                        self.cur[self.attr] = text
                        self.attr = None
                        return

                key = self._LABELS.get(text)
                if key is not None:
                        self.attr = key
100
101
def _downloads_count(sub):
        """Return the download counter of the entry `sub` as an int.

        Only the digits of the stored text are used, so counters written
        with thousands separators still sort correctly.  Entries without a
        usable counter count as 0 instead of aborting the whole search.
        """
        digits = ''.join(c for c in (sub.get('downloads') or '')
                        if c.isdigit())
        if not digits:
                return 0
        return int(digits)


def subdivx_get_subs(query_str):
        """Return every subtitle entry found for `query_str`.

        Result pages are requested one after another, starting at page 1,
        until a page yields no entries.  The entries (dicts filled in by
        SubDivXHTMLParser) are returned sorted by download count, most
        downloaded first.
        """
        page_number = 1
        subs = []
        while True:
                query = SubDivXQuery(query_str, page_number)
                parser = SubDivXHTMLParser(query.down_uri)
                url = urllib.urlopen(query.url)
                try:
                        for line in url:
                                parser.feed(line)
                finally:
                        # Release the connection even if parsing fails
                        url.close()

                if not parser.subs:
                        break

                subs.extend(parser.subs)
                page_number += 1

        return sorted(subs, key=_downloads_count, reverse=True)
122
123
def _extract_subtitles(fname):
        """Extract the .srt/.sub members of the zip file `fname`.

        The archive listing is printed first.  Members whose name contains
        '..' or starts with '/' are reported and skipped, since extracting
        them could write outside the current directory.
        """
        z = zipfile.ZipFile(fname, 'r')
        try:
                z.printdir()
                for fn in z.namelist():
                        if not fn.endswith(('.srt', '.sub')):
                                continue
                        if '..' in fn or fn.startswith('/'):
                                print('Dangerous file name: %s' % fn)
                                continue
                        print('Extracting %s ...' % fn)
                        z.extract(fn)
        finally:
                # Always release the file handle held by the archive
                z.close()


def get_subs(query_str):
        """Search for `query_str` and download and unpack every result.

        For each entry returned by subdivx_get_subs() a summary is printed
        and the archive is downloaded to a temporary file.  Zip archives
        are unpacked with _extract_subtitles(); rar archives are handed to
        the external `rar` program.  Any other content type is reported
        and skipped.
        """
        zip_exts = ('application/zip',)
        rar_exts = ('application/rar', 'application/x-rar-compressed')
        # Fields used by the summary below; the parser may have left some
        # of them unset for an incomplete entry
        fields = ('title', 'autor', 'fecha', 'downloads', 'comentarios',
                        'desc')

        for sub in subdivx_get_subs(query_str):
                info = dict.fromkeys(fields, '?')
                info.update(sub)
                print('''\
        - %(title)s (%(autor)s - %(fecha)s - %(downloads)s - %(comentarios)s)
          %(desc)s
                DOWNLOADING ...
        ''' % info)
                if 'url' not in sub:
                        print('No download link found!')
                        continue
                fname, headers = urllib.urlretrieve(sub['url'])
                try:
                        if 'Content-Type' in headers:
                                raw_type = headers['Content-Type']
                                # MIME types are case-insensitive and may
                                # carry parameters after a ';'
                                mime = raw_type.split(';', 1)[0].strip().lower()
                                if mime in zip_exts:
                                        _extract_subtitles(fname)
                                elif mime in rar_exts:
                                        # NOTE: rar extracts every member;
                                        # names are not checked as they are
                                        # for zip files
                                        if subprocess.call(['rar', 'x', fname]) != 0:
                                                print('Error unraring file %s' % fname)
                                else:
                                        print('Unrecognized file type: %s' % raw_type)
                        else:
                                print('No Content-Type!')
                finally:
                        # Drop the temporary file created by urlretrieve()
                        urllib.urlcleanup()
153
154
# Only run the searches when executed as a script, so that importing this
# file does not start network requests as a side effect
if __name__ == '__main__':
        # Every command line argument is a separate search query
        for q in sys.argv[1:]:
                get_subs(q)
157