'''
Author: doodhwala, leezeeyee
Python 3 script to fetch the paper PDFs linked in README.md
'''

import codecs
import os
import re

import requests
filename = 'README.md'
directory = 'papers'
if not os.path.exists(directory):
    os.makedirs(directory)
papers = []
with codecs.open(filename, encoding='utf-8', mode='r', buffering=1, errors='strict') as f:
    lines = f.read().split('\n')
    heading, section_path = '', ''
    # Characters that are not allowed in Windows file names.
    win_restricted_chars = re.compile(r'[\^\/\\\:\*\?\"<>\|]')
    for line in lines:
        if '## 20' in line:
            # A year heading such as "## 2016" starts a new section directory.
            heading = line.strip().split('##')[1]
            heading = win_restricted_chars.sub('', heading)
            section_path = os.path.join(directory, heading)
            if not os.path.exists(section_path):
                os.makedirs(section_path)
        if '[`[pdf]`]' in line:
            # The stars ensure you pick up only the top 100 papers.
            # Modify the expression if you want to fetch all other papers as well.
            result = re.search(r'(.*?)\[`\[pdf\]`\]\((.*?)\)', line)
            if result:
                paper, url = result.groups()
                paper = win_restricted_chars.sub('', paper)
                paper = paper.strip('- ')
                # Auto-resume functionality: skip papers that were already downloaded.
                if not os.path.exists(os.path.join(section_path, paper + '.pdf')):
                    print('Fetching', paper)
                    try:
                        response = requests.get(url)
                        # Treat HTTP error responses as failures so an error page is not saved as a .pdf.
                        response.raise_for_status()
                        with open(os.path.join(section_path, paper + '.pdf'), 'wb') as pdf_file:
                            pdf_file.write(response.content)
                    except requests.exceptions.RequestException as e:
                        print('Error: {}'.format(e))
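
# Usage sketch (assuming the script is saved as fetch_papers.py): run it from the
# repository root so README.md is in the working directory, e.g.
#   python3 fetch_papers.py
# Downloaded PDFs are written to papers/<year heading>/<paper title>.pdf.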