Skip to content

Commit e869099

Browse files
author
rvargas
committed
updated page indexes
1 parent ce0c5ee commit e869099

File tree

2 files changed

+26
-28
lines changed

2 files changed

+26
-28
lines changed

cazy_parser/create_cazy_db.py

Lines changed: 18 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -21,21 +21,20 @@
2121
import os, sys, urllib, re, string, time, string, argparse
2222
from bs4 import BeautifulSoup
2323

24-
def logo():
24+
def main():
25+
2526
print '''
26-
___ __ ____ _ _ ____ __ ____ ____ ____ ____
27-
/ __) / _\ (__ )( \/ )___( _ \ / _\ ( _ \/ ___)( __)( _ \\
28-
( (__ / \ / _/ ) /(___)) __// \ ) /\___ \ ) _) ) /
29-
\___)\_/\_/(____)(__/ (__) \_/\_/(__\_)(____/(____)(__\_)
27+
___ __ ____ _ _ ____ __ ____ ____ ____ ____
28+
/ __) / _\ (__ )( \/ )___( _ \ / _\ ( _ \/ ___)( __)( _ \\
29+
( (__ / \ / _/ ) /(___)) __// \ ) /\___ \ ) _) ) /
30+
\___)\_/\_/(____)(__/ (__) \_/\_/(__\_)(____/(____)(__\_)
3031
31-
A simple way to retrieve fasta sequences from CAZy Database (:
32+
A simple way to retrieve fasta sequences from CAZy Database (:
3233
33-
This is the database creator script.
34+
This is the database creator script.
3435
35-
'''
36+
'''
3637

37-
def main():
38-
logo()
3938
parser = argparse.ArgumentParser(description='Generate a comma separated table with information gathered from the CAZy database; internet connection is required.')
4039
args = parser.parse_args()
4140

@@ -93,7 +92,7 @@ def main():
9392
#==============================================================================#
9493
# Family section
9594
#==============================================================================#
96-
soup = BeautifulSoup(urllib.urlopen(main_class_link))
95+
soup = BeautifulSoup(urllib.urlopen(main_class_link), "lxml")
9796
# soup = BeautifulSoup(urllib.urlopen(main_class_link), 'lxml')
9897
family_table = soup.findAll(name='table')[0]
9998
rows = family_table.findAll(name='td')
@@ -108,7 +107,7 @@ def main():
108107
print '> %s' % family
109108
#
110109
main_link = 'http://www.cazy.org/%s.html' % family
111-
family_soup = BeautifulSoup(urllib.urlopen(main_link))
110+
family_soup = BeautifulSoup(urllib.urlopen(main_link), 'lxml')
112111
# main_link_dic = {'http://www.cazy.org/%s_all.html#pagination_PRINC' % family: '',
113112
# 'http://www.cazy.org/%s_characterized.html#pagination_PRINC' % family: 'characterized'}
114113
#====================#
@@ -124,14 +123,15 @@ def main():
124123

125124
page_zero = main_link
126125

127-
soup = BeautifulSoup(urllib.urlopen(main_link))
126+
soup = BeautifulSoup(urllib.urlopen(main_link), "lxml")
128127

129128
# Get page list for the family // 1, 2, 3, 4, 5, 7
130129
page_index_list = soup.findAll(name = 'a', attrs={'class':'lien_pagination'})
131-
# page_list = ['http://www.cazy.org/' + str(l['href']) for l in page_index_list] # deprecated
130+
132131
if bool(page_index_list):
133-
first_page_idx = int(page_index_list[0]['href'].split('PRINC=')[-1].split('#')[0]) # be careful with this
134-
last_page_idx = int(page_index_list[-2]['href'].split('PRINC=')[-1].split('#')[0]) # be careful with this
132+
133+
first_page_idx = int(re.findall('=(\d*)#', str(page_index_list[0]))[0]) # be careful with this
134+
last_page_idx = int(re.findall('=(\d*)#', str(page_index_list[-2]))[0]) # be careful with this
135135

136136
# generate page_list
137137
page_list = []
@@ -147,7 +147,7 @@ def main():
147147
for link in page_list:
148148
# print link
149149
# tr = rows // # td = cells
150-
soup = BeautifulSoup(urllib.urlopen(link))
150+
soup = BeautifulSoup(urllib.urlopen(link), "lxml")
151151
table = soup.find('table', attrs={'class':'listing'})
152152
domain = ''
153153

@@ -222,4 +222,5 @@ def main():
222222

223223
if __name__ == '__main__':
224224
main()
225+
225226
# done.

cazy_parser/extract_cazy_ids.py

Lines changed: 8 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -21,21 +21,18 @@
2121
import os, sys, itertools, urllib, argparse
2222
#==============================================================================#
2323

24-
def logo():
24+
def main(argv=sys.argv[1:]):
2525
print '''
26-
___ __ ____ _ _ ____ __ ____ ____ ____ ____
27-
/ __) / _\ (__ )( \/ )___( _ \ / _\ ( _ \/ ___)( __)( _ \\
28-
( (__ / \ / _/ ) /(___)) __// \ ) /\___ \ ) _) ) /
29-
\___)\_/\_/(____)(__/ (__) \_/\_/(__\_)(____/(____)(__\_)
30-
31-
A simple way to retrieve fasta sequences from CAZy Database (:
26+
___ __ ____ _ _ ____ __ ____ ____ ____ ____
27+
/ __) / _\ (__ )( \/ )___( _ \ / _\ ( _ \/ ___)( __)( _ \\
28+
( (__ / \ / _/ ) /(___)) __// \ ) /\___ \ ) _) ) /
29+
\___)\_/\_/(____)(__/ (__) \_/\_/(__\_)(____/(____)(__\_)
3230
33-
This is the accession code retrieval script.
31+
A simple way to retrieve fasta sequences from CAZy Database (:
3432
35-
'''
33+
This is the accession code retrieval script.
3634
37-
def main(argv=sys.argv[1:]):
38-
logo()
35+
'''
3936
#==============================================================================#
4037
# Options
4138
#==============================================================================#

0 commit comments

Comments
 (0)