65 lines
1.8 KiB
Python
65 lines
1.8 KiB
Python
import os
|
|
import requests
|
|
import fnmatch
|
|
|
|
# Connection settings are read from the environment when the module loads.
PAPERLESS_TOKEN = os.environ.get("PAPERLESS_TOKEN")
PAPERLESS_BASE_URL = os.environ.get("PAPERLESS_BASE_URL")

# Fail fast: both settings are required, and an empty string counts as unset.
if not PAPERLESS_TOKEN:
    raise ValueError("Paperless token is not set")
if not PAPERLESS_BASE_URL:
    raise ValueError("Paperless base URL is not set")

# Shell-style wildcard patterns matched against document titles.
search_params = ["Scan*", "PDF*"]
|
|
|
|
|
|
def get_all_documents():
    """Fetch every document record from the Paperless API, following pagination.

    Requests ``{PAPERLESS_BASE_URL}/documents/`` with token authentication and
    keeps following the ``next`` link of each response until there is none.

    Returns:
        list: The concatenated ``results`` entries of all pages.

    Raises:
        requests.HTTPError: If any page responds with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    headers = {"Authorization": f"Token {PAPERLESS_TOKEN}"}
    # Tolerate a base URL configured with a trailing slash.
    url = f"{PAPERLESS_BASE_URL.rstrip('/')}/documents/"

    documents = []
    while url:
        # A timeout keeps the script from hanging forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=30)
        # Without this check, an error response (e.g. 401) has no "results"
        # key and the function would silently return an empty list.
        response.raise_for_status()
        data = response.json()
        documents.extend(data.get("results", []))
        url = data.get("next")
    return documents
|
|
|
|
|
|
def filter_documents(documents, search_params):
    """Return the documents whose title matches at least one wildcard pattern.

    Args:
        documents: Iterable of mappings, each with a ``"title"`` key.
        search_params: Iterable of ``fnmatch``-style patterns.

    Returns:
        list: Matching documents in their original order, each included once
        even when several patterns match.
    """
    return [
        entry
        for entry in documents
        if any(fnmatch.fnmatch(entry["title"], glob) for glob in search_params)
    ]
|
|
|
|
|
|
def save_document_content(file_name, content):
    """Write *content* to ``content/<file_name>.txt`` as UTF-8 text.

    The ``content`` directory is created if it does not exist. Path
    separators and NUL characters in *file_name* are replaced with ``_`` so
    that a name such as ``"Scan 2020/01"`` or ``"../x"`` cannot fail on a
    missing sub-directory or write outside the ``content`` directory.

    Args:
        file_name: Base name of the output file, without extension.
        content: Text to write.

    Returns:
        str: The path of the file that was written.
    """
    os.makedirs("content", exist_ok=True)  # Ensure the content directory exists

    # Neutralise anything the OS would interpret as a directory boundary.
    safe_name = str(file_name)
    for unsafe in ("/", os.sep, os.altsep, "\0"):
        if unsafe:
            safe_name = safe_name.replace(unsafe, "_")

    path = f"content/{safe_name}.txt"
    with open(path, "w", encoding="utf-8") as file:
        file.write(content)
    return path
|
|
|
|
|
|
def main():
    """Download all documents, keep those matching ``search_params``, and save each one's text.

    For every matching document the ``content`` field (empty string when the
    key is absent) is written via ``save_document_content`` under the
    document's title, and a confirmation line is printed.
    """
    matching = filter_documents(get_all_documents(), search_params)

    for document in matching:
        title = document["title"]
        # The text comes straight from the record's 'content' field.
        text = document.get("content", "")
        save_document_content(title, text)
        message = f"Saved content for Document ID: {document['id']} to content/{title}.txt"
        print(message)


if __name__ == "__main__":
    main()
|