Initial Commit

This commit is contained in:
Ronnie 2025-02-20 21:09:33 -05:00
commit 27de462bc3
7 changed files with 392 additions and 0 deletions

65
test-paperless.py Normal file
View file

@ -0,0 +1,65 @@
import os
import requests
import fnmatch
# Configuration is taken from the process environment; both values are
# mandatory, so the script aborts immediately when either one is absent/empty.
PAPERLESS_TOKEN = os.environ.get("PAPERLESS_TOKEN")
if not PAPERLESS_TOKEN:
    raise ValueError("Paperless token is not set")

PAPERLESS_BASE_URL = os.environ.get("PAPERLESS_BASE_URL")
if not PAPERLESS_BASE_URL:
    raise ValueError("Paperless base URL is not set")

# Glob-style title patterns selecting which documents get exported
search_params = ["Scan*", "PDF*"]
# Function to get all documents from Paperless with pagination
def get_all_documents(timeout=30):
    """Fetch every document entry from the Paperless API, following pagination.

    Starts at ``{PAPERLESS_BASE_URL}/documents/`` and keeps requesting the URL
    found in each response's ``"next"`` field until that field is empty,
    collecting the ``"results"`` list of every page.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list with the entries of all pages, in the order they were received.

    Raises:
        requests.HTTPError: If any page is answered with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Tolerate a base URL configured with a trailing slash (avoids "//documents/").
    url = f"{PAPERLESS_BASE_URL.rstrip('/')}/documents/"
    documents = []
    # A single session reuses the underlying connection across all pages.
    with requests.Session() as session:
        session.headers.update({"Authorization": f"Token {PAPERLESS_TOKEN}"})
        while url:
            response = session.get(url, timeout=timeout)
            # Fail loudly on auth/server errors instead of treating the error
            # payload as a page without results.
            response.raise_for_status()
            data = response.json()
            documents.extend(data.get("results", []))
            url = data.get("next")
    return documents
# Function to filter documents based on search parameters
def filter_documents(documents, search_params):
    """Return the documents whose title matches at least one pattern.

    Each document's ``"title"`` value is compared against the patterns in
    ``search_params`` using ``fnmatch.fnmatch``. A matching document appears
    exactly once in the result, and the input order is preserved.
    """
    return [
        entry
        for entry in documents
        if any(fnmatch.fnmatch(entry["title"], glob) for glob in search_params)
    ]
def save_document_content(file_name, content):
    """Write ``content`` to ``content/<file_name>.txt`` as UTF-8 text.

    The ``content`` directory is created on demand. Path separators and NUL
    characters in ``file_name`` are replaced with ``_`` so that a name such as
    ``"01/2024"`` or ``"../x"`` cannot fail on a missing sub-directory or end
    up outside the ``content`` directory.

    Args:
        file_name: Base name of the output file, without the ``.txt`` suffix.
        content: Text to write.

    Returns:
        The path of the file that was written.
    """
    # Characters that would change which directory the file lands in
    # (os.altsep is None on platforms with a single separator).
    unsafe = {os.sep, os.altsep, "\0"} - {None}
    safe_name = "".join("_" if ch in unsafe else ch for ch in file_name)

    os.makedirs("content", exist_ok=True)  # Ensure the content directory exists
    path = f"content/{safe_name}.txt"
    with open(path, "w", encoding="utf-8") as file:
        file.write(content)
    return path
def main():
    """Export the text content of every document matching ``search_params``.

    Fetches all documents, keeps those whose title matches one of the
    patterns, writes each one's ``"content"`` field (empty string when the
    field is absent) through ``save_document_content`` and prints a
    confirmation line per document.
    """
    matches = filter_documents(get_all_documents(), search_params)
    for entry in matches:
        title = entry["title"]
        # The text comes straight from the entry's 'content' field
        save_document_content(title, entry.get("content", ""))
        print(
            f"Saved content for Document ID: {entry['id']} to content/{title}.txt"
        )


if __name__ == "__main__":
    main()