Initial Commit

2025-02-20 21:09:33 -05:00 · 2025-02-20 21:09:33 -05:00 · 27de462bc3
commit 27de462bc3
7 changed files with 392 additions and 0 deletions
--- a/test-paperless.py
+++ b/test-paperless.py
@ -0,0 +1,65 @@
+import os
+import requests
+import fnmatch
+
+# Connection settings are read from the process environment
+PAPERLESS_TOKEN = os.environ.get("PAPERLESS_TOKEN")
+PAPERLESS_BASE_URL = os.environ.get("PAPERLESS_BASE_URL")
+
+# Fail at import time when the token is missing or empty
+if not PAPERLESS_TOKEN:
+    raise ValueError("Paperless token is not set")
+
+# Same check for the base URL
+if not PAPERLESS_BASE_URL:
+    raise ValueError("Paperless base URL is not set")
+
+# Glob-style title patterns used to select documents
+search_params = ["Scan*", "PDF*"]
+
+
+# Function to get all documents from Paperless with pagination
+def get_all_documents():
+    """Fetch every document record from the Paperless API.
+
+    Requests ``<PAPERLESS_BASE_URL>/documents/`` and then keeps following
+    the "next" link of each response until it is absent or empty,
+    accumulating the "results" entries of every page.
+
+    Returns:
+        A list holding the "results" items of all pages, in response order.
+
+    Raises:
+        requests.HTTPError: If a page request returns a 4xx/5xx status.
+        requests.RequestException: On connection failures or timeouts.
+    """
+    headers = {"Authorization": f"Token {PAPERLESS_TOKEN}"}
+    # Tolerate a trailing slash in the configured base URL
+    url = f"{PAPERLESS_BASE_URL.rstrip('/')}/documents/"
+
+    documents = []
+    while url:
+        # requests applies no timeout by default; without one a stalled
+        # server would block this loop forever
+        response = requests.get(url, headers=headers, timeout=30)
+        # Surface auth/server errors instead of parsing an error body as a page
+        response.raise_for_status()
+        data = response.json()
+        documents.extend(data.get("results", []))
+        url = data.get("next")
+    return documents
+
+
+# Keep only the documents whose title matches one of the glob patterns
+def filter_documents(documents, search_params):
+    """Return the documents whose "title" matches any pattern.
+
+    Args:
+        documents: Iterable of mappings that each have a "title" key.
+        search_params: Iterable of fnmatch-style patterns.
+
+    Returns:
+        A new list with the matching documents in their original order;
+        each document appears at most once.
+    """
+    return [
+        candidate
+        for candidate in documents
+        if any(fnmatch.fnmatch(candidate["title"], glob) for glob in search_params)
+    ]
+
+
+def save_document_content(file_name, content):
+    """Write *content* to ``content/<file_name>.txt`` as UTF-8 text.
+
+    Path separators and NUL characters in *file_name* are replaced with
+    underscores, so a name such as "Scan 01/02" or "../x" cannot point
+    into a missing subdirectory or outside the ``content`` directory.
+
+    Args:
+        file_name: Base name for the output file, without extension.
+        content: Text to write.
+
+    Returns:
+        The relative path of the file that was written.
+    """
+    # Neutralise characters that would alter the directory part of the path
+    unsafe = str.maketrans({"/": "_", "\\": "_", "\0": "_"})
+    safe_name = file_name.translate(unsafe)
+    os.makedirs("content", exist_ok=True)  # Ensure the content directory exists
+    path = f"content/{safe_name}.txt"
+    with open(path, "w", encoding="utf-8") as file:
+        file.write(content)
+    return path
+
+
+def main():
+    """Fetch all documents, keep the matching ones, and save their text.
+
+    Each document whose title matches ``search_params`` has its "content"
+    field written via ``save_document_content``, followed by a
+    confirmation line on stdout.
+    """
+    matching = filter_documents(get_all_documents(), search_params)
+
+    for doc in matching:
+        # Text is taken from the record's 'content' field ("" when absent)
+        save_document_content(doc["title"], doc.get("content", ""))
+        print(
+            f"Saved content for Document ID: {doc['id']} to content/{doc['title']}.txt"
+        )
+
+
+# Run the export only when executed as a script, not when imported
+if __name__ == "__main__":
+    main()