Skip to content
This repository was archived by the owner on Jun 2, 2024. It is now read-only.

Commit ebc43fd

Browse files
committed
init
1 parent 6a68936 commit ebc43fd

File tree

3 files changed

+66
-0
lines changed

3 files changed

+66
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/photo/
2+
/.vscode/

crawler.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
"""Crawl cloud photos from clouds-online.com.

Reads cloud types and their atlas start pages from ``page.json``, discovers
the extra pagination pages linked from each start page, then downloads every
thumbnail's full-size image into ``photo/<cloud_type>/``.
"""
from json import load
from os import makedirs, getcwd, path
from urllib.request import urlretrieve  # py3 location of urllib.urlretrieve

from bs4 import BeautifulSoup
import requests

# NOTE(review): the original wrapped sys.stdout/sys.stderr in a cp850 codecs
# writer (a Python-2-only idiom for Windows consoles).  On Python 3 the
# standard streams are already text; set PYTHONIOENCODING=cp850 in the
# environment instead if that console encoding is still required.

with open('page.json') as data_file:
    print("Loading json..")
    # Maps cloud-type name -> list of page URLs; index 0 is the start page.
    pageLink = load(data_file)

for cloudType in pageLink:
    # One output directory per cloud type, e.g. ./photo/Cirrus
    dic_path = path.join(getcwd(), "photo", cloudType)
    makedirs(dic_path, exist_ok=True)  # no isdir pre-check: avoids TOCTOU race

    # Step 1: collect the extra pagination pages linked from the start page.
    try:
        link = pageLink[cloudType][0]
        print("Finding all links in: " + link)
        doc = requests.get(link).text
        # Explicit parser: silences bs4's warning and pins behavior across
        # environments.
        soup = BeautifulSoup(doc, "html.parser")
        # Pagination anchors live in the second centered <p> of the page.
        otherLink = soup.find_all('p', align='center')[1].find_all('a')
        # Base URL = directory part of the start URL (hrefs are relative).
        base = link.rsplit("/", 1)[0] + "/"
        for ol in otherLink:
            pageLink[cloudType].append(base + ol["href"])
    except (requests.RequestException, IndexError, KeyError) as err:
        # Narrow catch + detail so a layout change or network error is
        # diagnosable; the crawl continues with whatever pages we have.
        print("Error while making links: {}".format(err))

    print("Start to crawler photos.")
    for link in pageLink[cloudType]:
        try:
            print("Dealing " + link)
            doc = requests.get(link).text
            soup = BeautifulSoup(doc, "html.parser")
            # Thumbnails are the 160px-wide images on the page.
            for iL in soup.find_all('img', width='160'):
                # Thumbnail URLs live under /th/ with a "_th." suffix;
                # stripping both yields the full-size image URL.
                src = iL["src"].replace("/th/", "/").replace("_th.", ".")
                print("Saving: " + src)
                # Filename = first path segment after "wolken/" in the URL.
                filename = src.split("wolken/")[1].split("/")[1]
                urlretrieve(src, path.join(dic_path, filename))
        except Exception as err:  # best effort: keep crawling remaining pages
            print("Error saving image at link: " + link + " ({})".format(err))

page.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"Cirrus": ["http://www.clouds-online.com/cloud_atlas/cirrus/cirrus.htm"],
3+
"Cirrostratus": ["http://www.clouds-online.com/cloud_atlas/cirrostratus/cirrostratus.htm"],
4+
"Cirrocumulus": ["http://www.clouds-online.com/cloud_atlas/cirrocumulus/cirrocumulus.htm"],
5+
"Altocumulus": ["http://www.clouds-online.com/cloud_atlas/altocumulus/altocumulus.htm"],
6+
"Altostratus": ["http://www.clouds-online.com/cloud_atlas/altostratus/altostratus.htm"],
7+
"Stratocumulus": ["http://www.clouds-online.com/cloud_atlas/stratocumulus/stratocumulus.htm"],
8+
"Stratus": ["http://www.clouds-online.com/cloud_atlas/stratus/stratus.htm"],
9+
"Nimbostratus": ["http://www.clouds-online.com/cloud_atlas/nimbostratus/nimbostratus.htm"],
10+
"Cumulus": ["http://www.clouds-online.com/cloud_atlas/cumulus/cumulus.htm"],
11+
"Cumulonimbus": ["http://www.clouds-online.com/cloud_atlas/cumulonimbus/cumulonimbus.htm"]
12+
}

0 commit comments

Comments
 (0)