paperless-ai-renaming/main.py
2025-02-20 21:09:33 -05:00

134 lines
4.2 KiB
Python

import os
import requests
import re
import fnmatch
from openai import OpenAI
# Load configuration from the environment.
CHATGPT_TOKEN = os.getenv("CHATGPT_TOKEN")
PAPERLESS_TOKEN = os.getenv("PAPERLESS_TOKEN")
PAPERLESS_BASE_URL = os.getenv("PAPERLESS_BASE_URL")

# Validate *before* building the OpenAI client: the client constructor can
# reject a missing API key on its own, which would pre-empt this clearer error.
if not CHATGPT_TOKEN or not PAPERLESS_TOKEN:
    raise ValueError("Environment variables for tokens are not set")
# Every Paperless request URL is built from this value, so fail early when it
# is missing instead of issuing requests against "None/documents/".
if not PAPERLESS_BASE_URL:
    raise ValueError("Environment variable PAPERLESS_BASE_URL is not set")

client = OpenAI(api_key=CHATGPT_TOKEN)

# fnmatch-style patterns matched against document titles; "*" selects all.
search_params = ["*"]

# Maximum number of attempts at renaming a single document.
MAX_RETRIES = 3
def get_all_documents(timeout=60):
    """Fetch every document from the Paperless API, following pagination.

    Starts at ``{PAPERLESS_BASE_URL}/documents/`` and keeps requesting the URL
    found under the ``next`` key of each response until it is empty.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        A list of dicts with the keys ``id``, ``title`` and ``content``.
        A missing or null ``content`` is normalized to ``""``.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    headers = {"Authorization": f"Token {PAPERLESS_TOKEN}"}
    url = f"{PAPERLESS_BASE_URL}/documents/"
    documents = []
    while url:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Surface auth/server errors instead of silently treating the error
        # payload as a page with no results.
        response.raise_for_status()
        data = response.json()
        documents.extend(
            {
                "id": doc["id"],
                "title": doc["title"],
                # `or ""` also covers an explicit null so that later string
                # operations on the content cannot fail.
                "content": doc.get("content") or "",
            }
            for doc in data.get("results", [])
        )
        url = data.get("next")
    return documents
def filter_documents(documents, search_params):
    """Select the documents whose title matches at least one pattern.

    Args:
        documents: Iterable of dicts that each carry a ``title`` key.
        search_params: Iterable of patterns passed to ``fnmatch.fnmatch``.

    Returns:
        A new list with the matching documents in their original order; a
        document matching several patterns is included only once.
    """
    return [
        entry
        for entry in documents
        if any(fnmatch.fnmatch(entry["title"], glob) for glob in search_params)
    ]
def sanitize_filename(name):
    """Return ``name`` with filename-unsafe characters removed.

    The characters dropped are backslash, slash, asterisk, question mark,
    colon, double quote, less-than, greater-than and pipe. Everything else,
    including spaces, is kept unchanged.
    """
    forbidden = '\\/*?:"<>|'
    return "".join(ch for ch in name if ch not in forbidden)
def generate_pdf_name(ocr_content):
    """Ask the chat model to suggest a title for a document's OCR text.

    Args:
        ocr_content: OCR text of the document. ``None`` is treated as empty
            text so a document without content cannot crash the run.

    Returns:
        The suggested title passed through ``sanitize_filename`` (possibly an
        empty string), or one of these placeholder strings:

        * ``"Unable_To_Suggest_Title"`` - the reply contains
          "unable to suggest" (compared case-insensitively).
        * ``"No_Response"`` - the reply has no choices or no message content.
        * ``"Error_Generated_Title"`` - the API call raised an exception.

        The placeholders are non-empty strings, so callers must compare
        against them explicitly to detect a failed generation.
    """
    formatted_content = (ocr_content or "").replace("\n", " ")

    # Prompt for generating a title based on the OCR content.
    prompt = (
        "\n"
        "Please suggest a descriptive and unique document title.\n"
        "Spaces are preferred over underscores.\n"
        "First, correct any spelling mistakes in the following content, "
        f"then suggest the title: {formatted_content}\n"
    )

    try:
        response = client.chat.completions.create(
            model="gpt-4-turbo-preview",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": ""},
            ],
            temperature=1,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
    except Exception as e:
        # Boundary with the remote API: any failure (network, auth, rate
        # limit, ...) is reported and mapped to a placeholder title.
        print(f"An error occurred: {e}")
        return "Error_Generated_Title"

    if not response or not response.choices:
        return "No_Response"

    content = response.choices[0].message.content
    if content is None:
        # A choice may carry no text at all; handle it explicitly rather than
        # failing on None.strip().
        return "No_Response"

    suggested_name = content.strip()
    # Case-insensitive: a refusal typically starts a sentence with "Unable".
    if "unable to suggest" in suggested_name.lower():
        return "Unable_To_Suggest_Title"
    return sanitize_filename(suggested_name)
def rename_pdf(document_id, new_name, timeout=30):
    """Set a document's title through the Paperless API.

    Sends a PATCH to ``{PAPERLESS_BASE_URL}/documents/{document_id}/`` with
    the new title as form data.

    Args:
        document_id: Identifier of the document to rename.
        new_name: Title to assign.
        timeout: Seconds to wait for the request before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        ``True`` if the server answered with a non-error status, ``False`` if
        it answered with an error status or the request could not be
        completed (connection failure, timeout, ...).
    """
    headers = {"Authorization": f"Token {PAPERLESS_TOKEN}"}
    try:
        response = requests.patch(
            f"{PAPERLESS_BASE_URL}/documents/{document_id}/",
            headers=headers,
            data={"title": new_name},
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Report transport failures as an ordinary failed attempt so one
        # network hiccup does not abort the whole run.
        print(f"Request to rename document {document_id} failed: {e}")
        return False
    return response.ok
# Placeholder titles generate_pdf_name returns instead of a real suggestion.
# They are non-empty strings, so a plain truthiness check cannot detect them.
_FAILURE_TITLES = frozenset(
    {"Unable_To_Suggest_Title", "No_Response", "Error_Generated_Title"}
)


def main():
    """Rename every matching Paperless document to a generated title.

    Fetches all documents, keeps those whose title matches ``search_params``
    and, for each one, tries up to ``MAX_RETRIES`` times to generate a title
    and apply it. A document that could not be renamed within that budget is
    recorded by appending a line to ``error.log``.
    """
    documents = get_all_documents()
    filtered_documents = filter_documents(documents, search_params)

    for doc in filtered_documents:
        doc_id = doc["id"]
        for attempt in range(1, MAX_RETRIES + 1):
            new_name = generate_pdf_name(doc["content"])
            # An empty or placeholder title means generation failed; never
            # write such a value back as the document's title.
            usable = bool(new_name) and new_name not in _FAILURE_TITLES
            if usable and rename_pdf(doc_id, new_name):
                print(f"Renamed document {doc_id} to {new_name}")
                break  # Rename successful, move to the next document
            print(f"Retry {attempt} for document {doc_id}")
        else:
            # Reached only when no attempt ended in `break`, i.e. every
            # attempt failed.
            with open("error.log", "a", encoding="utf-8") as error_log:
                error_log.write(
                    f"Failed to rename document {doc_id} after {MAX_RETRIES} retries\n"
                )


if __name__ == "__main__":
    main()