'''
Author: doodhwala, leezeeyee
Python 3 script to fetch the paper PDFs linked in README.md
'''

import codecs
import os
import re

import requests
filename = 'README.md'
directory = 'papers'
if not os.path.exists(directory):
    os.makedirs(directory)
papers = []
with codecs.open(filename, encoding='utf-8', mode='r', buffering=1, errors='strict') as f:
    lines = f.read().split('\n')
    heading, section_path = '', ''
    # Characters that are not allowed in Windows file names.
    win_restricted_chars = re.compile(r'[\^\/\\\:\*\?\"<>\|]')
    for line in lines:
        if '## 20' in line:
            # A year heading such as "## 2016" starts a new section directory.
            heading = line.strip().split('##')[1]
            heading = win_restricted_chars.sub('', heading)
            section_path = os.path.join(directory, heading)
            if not os.path.exists(section_path):
                os.makedirs(section_path)
        if '[`[pdf]`]' in line:
            # The stars ensure you pick up only the top 100 papers.
            # Modify the expression if you want to fetch all other papers as well.
            result = re.search(r'(.*?)\[`\[pdf\]`\]\((.*?)\)', line)
            if result:
                paper, url = result.groups()
                paper = win_restricted_chars.sub('', paper)
                paper = paper.strip('- ')
                # Auto-resume functionality: skip papers that were already downloaded.
                if not os.path.exists(os.path.join(section_path, paper + '.pdf')):
                    print('Fetching', paper)
                    try:
                        response = requests.get(url)
                        # Treat HTTP error responses as failures so an error page is not saved as a .pdf.
                        response.raise_for_status()
                        with open(os.path.join(section_path, paper + '.pdf'), 'wb') as pdf_file:
                            pdf_file.write(response.content)
                    except requests.exceptions.RequestException as e:
                        print('Error: {}'.format(e))
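
# Usage sketch (assuming the script is saved as fetch_papers.py): run it from the
# repository root so README.md is in the working directory, e.g.
#   python3 fetch_papers.py
# Downloaded PDFs are written to papers/<year heading>/<paper title>.pdf.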