"""Crawl the cloud-photo index pages listed in page.json and download the
full-size version of every thumbnail found, sorted into photo/<cloudType>/."""

from json import load
from os import makedirs, getcwd, path

from bs4 import BeautifulSoup
import requests
import codecs
import sys
import urllib.request

# I/O encoding for the Windows console, see https://stackoverflow.com/questions/14630288
# NOTE(review): the original wrapped the streams with codecs.getwriter, which is a
# Python 2 idiom and breaks on Python 3 text streams; reconfigure() is the Py3 fix.
if hasattr(sys.stdout, "reconfigure"):
    if sys.stdout.encoding != 'cp850':
        sys.stdout.reconfigure(encoding='cp850', errors='strict')
    if sys.stderr.encoding != 'cp850':
        sys.stderr.reconfigure(encoding='cp850', errors='strict')

# page.json maps cloud-type name -> list whose first element is the index page URL.
with open('page.json') as data_file:
    print("Loading json..")
    pageLink = load(data_file)

for cloudType in pageLink:
    # One output directory per cloud type, under the current working directory.
    dic_path = path.join(getcwd(), "photo", cloudType)
    if not path.isdir(dic_path):
        makedirs(dic_path)

    # Step 1: expand the index page into the list of sibling gallery pages.
    try:
        link = pageLink[cloudType][0]
        print("Finding all links in: " + link)
        doc = requests.get(link).text
        # Explicit parser keeps behavior independent of which bs4 backends are installed.
        soup = BeautifulSoup(doc, "html.parser")
        # Pagination links live in the second centered <p> on the index page.
        otherLink = soup.find_all('p', align='center')[1].find_all('a')
        # Base URL = everything up to and including the last "/" of the index URL.
        link = link.rsplit("/", 1)[0] + "/"
        for ol in otherLink:
            pageLink[cloudType].append(link + ol["href"])
    except Exception as exc:
        # Best-effort: a broken index page should not abort the other cloud types,
        # but do surface the cause instead of swallowing it silently.
        print("Error while making links.")
        print(exc)

    # Step 2: visit every gallery page and download the full-size images.
    print("Start to crawler photos.")
    for link in pageLink[cloudType]:
        try:
            print("Dealing " + link)
            doc = requests.get(link).text
            soup = BeautifulSoup(doc, "html.parser")
            # Thumbnails on the gallery pages are the 160px-wide <img> tags.
            imgLink = soup.find_all('img', width='160')
            for iL in imgLink:
                # Thumbnail URL -> full-size URL: drop the /th/ path segment
                # and the _th filename suffix.
                src = iL["src"].replace("/th/", "/").replace("_th.", ".")
                print("Saving: " + src)
                # Filename: the path component right after ".../wolken/".
                # NOTE(review): assumes every src contains "wolken/" — an src
                # without it raises and is caught per-page below.
                filename = src.split("wolken/")[1].split("/")[1]
                # Py3 location of urlretrieve (urllib.urlretrieve was Python 2 only).
                urllib.request.urlretrieve(src, path.join(dic_path, filename))
        except Exception as exc:
            # Keep crawling the remaining pages, but report why this one failed.
            print("Error saving image at link: " + link)
            print(exc)