langchain-ai · deanchanter · Apr 29, 2025
diff --git a/mcpdoc/main.py b/mcpdoc/main.py
@@ -7,7 +7,7 @@
 from markdownify import markdownify
 from mcp.server.fastmcp import FastMCP
 from typing_extensions import NotRequired, TypedDict
-
+from pypdf import PdfReader
 
 class DocSource(TypedDict):
     """A source of documentation for a library or a package."""
@@ -21,6 +21,17 @@ class DocSource(TypedDict):
     description: NotRequired[str]
     """Description of the documentation source (optional)."""
 
+def extract_text_from_pdf(pdf_file) -> str:
+    """Extract the text of a PDF given as any source ``PdfReader`` accepts.
+
+    Page texts are newline-joined and stripped; failures raise ValueError.
+    """
+    try:
+        pages = PdfReader(pdf_file).pages
+        return "\n".join(page.extract_text() for page in pages).strip()
+    except Exception as e:
+        raise ValueError(f"Failed to extract text from PDF: {e}") from e
+
 
 def extract_domain(url: str) -> str:
     """Extract domain from URL.
@@ -213,7 +224,14 @@ async def fetch_docs(url: str) -> str:
             try:
                 response = await httpx_client.get(url, timeout=timeout)
                 response.raise_for_status()
-                return markdownify(response.text)
+                if url.endswith(".txt"):
+                    return  response.text
+                elif url.endswith(".md"):
+                    return markdownify(response.text)
+                elif url.endswith(".pdf"):
+                    return extract_text_from_pdf(io.BytesIO(response.content))
+                else:
+                    return markdownify(response.text)
             except (httpx.HTTPStatusError, httpx.RequestError) as e:
                 return f"Encountered an HTTP error: {str(e)}"
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,6 +9,7 @@ dependencies = [
     "httpx>=0.28.1",
     "markdownify>=1.1.0",
     "mcp[cli]>=1.4.1",
+    "pypdf>=5.4.0",
     "pyyaml>=6.0.1",
 ]
 

diff --git a/uv.lock b/uv.lock