# paperless-ai-renaming/test-paperless.py

import os
import requests
import fnmatch

# Connection settings are read from the environment; each one is mandatory,
# so the script aborts with a ValueError as soon as one is found missing.
PAPERLESS_TOKEN = os.environ.get("PAPERLESS_TOKEN")
if not PAPERLESS_TOKEN:
    raise ValueError("Paperless token is not set")

PAPERLESS_BASE_URL = os.environ.get("PAPERLESS_BASE_URL")
if not PAPERLESS_BASE_URL:
    raise ValueError("Paperless base URL is not set")

# Shell-style title patterns; a document is selected when its title matches
# at least one of them.
search_params = ["Scan*", "PDF*"]


# Function to get all documents from Paperless with pagination
def get_all_documents(timeout=30):
    """Fetch every document from the Paperless API, following pagination.

    Requests ``<PAPERLESS_BASE_URL>/documents/`` first and then keeps
    requesting the URL found under the ``next`` key of each JSON response
    until that key is missing or empty.

    Args:
        timeout: Value passed as ``timeout`` to every ``requests.get`` call
            (seconds), so a stalled server cannot hang the script forever.

    Returns:
        A list containing the items of every page's ``results`` array, in
        the order the pages were received.

    Raises:
        requests.HTTPError: If a page request answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    headers = {"Authorization": f"Token {PAPERLESS_TOKEN}"}
    # Drop trailing slashes so a base URL such as ".../api/" does not
    # produce ".../api//documents/".
    url = f"{PAPERLESS_BASE_URL.rstrip('/')}/documents/"

    documents = []
    while url:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Without this check an error response (e.g. 401 with a JSON body)
        # has no "results" key and would silently yield an empty list.
        response.raise_for_status()
        data = response.json()
        documents.extend(data.get("results", []))
        url = data.get("next")
    return documents


# Function to filter documents based on search parameters
def filter_documents(documents, search_params):
    """Return the documents whose title matches at least one pattern.

    Args:
        documents: Iterable of mappings, each with a ``"title"`` entry.
        search_params: Shell-style wildcard patterns checked with
            ``fnmatch.fnmatch``.

    Returns:
        A new list holding the matching documents in their original order;
        a document appears once even if several patterns match it.
    """
    return [
        entry
        for entry in documents
        if any(fnmatch.fnmatch(entry["title"], glob) for glob in search_params)
    ]


def save_document_content(file_name, content):
    """Write ``content`` as UTF-8 text to ``content/<file_name>.txt``.

    The ``content`` directory is created in the current working directory
    when it does not exist yet. An existing file of the same name is
    overwritten.

    Args:
        file_name: Base name of the file, without extension. Path
            separators (``/`` and ``\\``) and NUL characters are replaced
            with ``_`` before the path is built.
        content: Text to write.
    """
    # The name may be an arbitrary document title: a "/" would point into a
    # directory that does not exist, and "../" would escape the content folder.
    safe_name = file_name.replace("/", "_").replace("\\", "_").replace("\0", "_")
    os.makedirs("content", exist_ok=True)  # Ensure the content directory exists
    target = os.path.join("content", f"{safe_name}.txt")
    with open(target, "w", encoding="utf-8") as file:
        file.write(content)


def main():
    """Download all documents, keep the matching ones and save their text.

    Every document whose title matches ``search_params`` has the value of
    its ``content`` field (empty string when the field is absent) written
    through ``save_document_content``, followed by a confirmation line on
    standard output.
    """
    matching = filter_documents(get_all_documents(), search_params)

    for document in matching:
        title = document["title"]
        # The text comes straight from the document's 'content' field
        save_document_content(title, document.get("content", ""))
        print(
            f"Saved content for Document ID: {document['id']} to content/{title}.txt"
        )


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()