21
21
import os , sys , urllib , re , string , time , string , argparse
22
22
from bs4 import BeautifulSoup
23
23
24
- def logo ():
24
+ def main ():
25
+
25
26
print '''
26
- ___ __ ____ _ _ ____ __ ____ ____ ____ ____
27
- / __) / _\ (__ )( \/ )___( _ \ / _\ ( _ \/ ___)( __)( _ \\
28
- ( (__ / \ / _/ ) /(___)) __// \ ) /\___ \ ) _) ) /
29
- \___)\_/\_/(____)(__/ (__) \_/\_/(__\_)(____/(____)(__\_)
27
+ ___ __ ____ _ _ ____ __ ____ ____ ____ ____
28
+ / __) / _\ (__ )( \/ )___( _ \ / _\ ( _ \/ ___)( __)( _ \\
29
+ ( (__ / \ / _/ ) /(___)) __// \ ) /\___ \ ) _) ) /
30
+ \___)\_/\_/(____)(__/ (__) \_/\_/(__\_)(____/(____)(__\_)
30
31
31
- A simple way to retrieve fasta sequences from CAZy Database (:
32
+ A simple way to retrieve fasta sequences from CAZy Database (:
32
33
33
- This is the database creator script.
34
+ This is the database creator script.
34
35
35
- '''
36
+ '''
36
37
37
- def main ():
38
- logo ()
39
38
parser = argparse .ArgumentParser (description = 'Generate a comma separated table with information gathered from the CAZy database; internet connection is required.' )
40
39
args = parser .parse_args ()
41
40
@@ -93,7 +92,7 @@ def main():
93
92
#==============================================================================#
94
93
# Family section
95
94
#==============================================================================#
96
- soup = BeautifulSoup (urllib .urlopen (main_class_link ))
95
+ soup = BeautifulSoup (urllib .urlopen (main_class_link ), "lxml" )
97
96
# soup = BeautifulSoup(urllib.urlopen(main_class_link), 'lxml')
98
97
family_table = soup .findAll (name = 'table' )[0 ]
99
98
rows = family_table .findAll (name = 'td' )
@@ -108,7 +107,7 @@ def main():
108
107
print '> %s' % family
109
108
#
110
109
main_link = 'http://www.cazy.org/%s.html' % family
111
- family_soup = BeautifulSoup (urllib .urlopen (main_link ))
110
+ family_soup = BeautifulSoup (urllib .urlopen (main_link ), 'lxml' )
112
111
# main_link_dic = {'http://www.cazy.org/%s_all.html#pagination_PRINC' % family: '',
113
112
# 'http://www.cazy.org/%s_characterized.html#pagination_PRINC' % family: 'characterized'}
114
113
#====================#
@@ -124,14 +123,15 @@ def main():
124
123
125
124
page_zero = main_link
126
125
127
- soup = BeautifulSoup (urllib .urlopen (main_link ))
126
+ soup = BeautifulSoup (urllib .urlopen (main_link ), "lxml" )
128
127
129
128
# Get page list for the family // 1, 2, 3, 4, 5, 7
130
129
page_index_list = soup .findAll (name = 'a' , attrs = {'class' :'lien_pagination' })
131
- # page_list = ['http://www.cazy.org/' + str(l['href']) for l in page_index_list] # deprecated
130
+
132
131
if bool (page_index_list ):
133
- first_page_idx = int (page_index_list [0 ]['href' ].split ('PRINC=' )[- 1 ].split ('#' )[0 ]) # be careful with this
134
- last_page_idx = int (page_index_list [- 2 ]['href' ].split ('PRINC=' )[- 1 ].split ('#' )[0 ]) # be careful with this
132
+
133
+ first_page_idx = int (re .findall ('=(\d*)#' , str (page_index_list [0 ]))[0 ]) # be careful with this
134
+ last_page_idx = int (re .findall ('=(\d*)#' , str (page_index_list [- 2 ]))[0 ]) # be careful with this
135
135
136
136
# generate page_list
137
137
page_list = []
@@ -147,7 +147,7 @@ def main():
147
147
for link in page_list :
148
148
# print link
149
149
# tr = rows // # td = cells
150
- soup = BeautifulSoup (urllib .urlopen (link ))
150
+ soup = BeautifulSoup (urllib .urlopen (link ), "lxml" )
151
151
table = soup .find ('table' , attrs = {'class' :'listing' })
152
152
domain = ''
153
153
@@ -222,4 +222,5 @@ def main():
222
222
223
223
if __name__ == '__main__' :
224
224
main ()
225
+
225
226
# done.
0 commit comments