import fnmatch
import os
import re

import requests
from openai import OpenAI

# Load environment variables
CHATGPT_TOKEN = os.getenv("CHATGPT_TOKEN")
PAPERLESS_TOKEN = os.getenv("PAPERLESS_TOKEN")
PAPERLESS_BASE_URL = os.getenv("PAPERLESS_BASE_URL")

# Ensure that the required environment variables are set before creating the client
if not CHATGPT_TOKEN or not PAPERLESS_TOKEN or not PAPERLESS_BASE_URL:
    raise ValueError("Environment variables for the tokens and base URL are not set")

client = OpenAI(api_key=CHATGPT_TOKEN)
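
# Example setup (hypothetical values, adjust for your own instance), assuming
# PAPERLESS_BASE_URL points at the paperless-ngx API root:
#   export CHATGPT_TOKEN="sk-..."
#   export PAPERLESS_TOKEN="0123456789abcdef"
#   export PAPERLESS_BASE_URL="https://paperless.example.com/api"
#   python rename_documents.py  # hypothetical filename for this script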

# Set search parameters to get all documents
search_params = ["*"]
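# For example, search_params = ["Invoice*", "*2023*"] would limit processing to
# titles that start with "Invoice" or contain "2023" (fnmatch-style wildcards).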

# Maximum number of retries for renaming a failed document
MAX_RETRIES = 3


def get_all_documents():
    """Fetch all documents from the Paperless API, following pagination."""
    headers = {"Authorization": f"Token {PAPERLESS_TOKEN}"}
    url = f"{PAPERLESS_BASE_URL}/documents/"
    documents = []
    while url:
        response = requests.get(url, headers=headers)
        data = response.json()
        for doc in data.get("results", []):
            documents.append(
                {
                    "id": doc["id"],
                    "title": doc["title"],
                    "content": doc.get("content", ""),
                }
            )
        # The API returns the URL of the next page (or None on the last page)
        url = data.get("next")
    return documents


def filter_documents(documents, search_params):
    """Return only the documents whose titles match one of the search patterns."""
    filtered_docs = []
    for doc in documents:
        for pattern in search_params:
            if fnmatch.fnmatch(doc["title"], pattern):
                filtered_docs.append(doc)
                break
    return filtered_docs


def sanitize_filename(name):
    # Remove characters that are not allowed in filenames (spaces are kept)
    return re.sub(r'[\\/*?:"<>|]', "", name)
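# For example, sanitize_filename('Invoice: "ACME" 2023?') returns 'Invoice ACME 2023'.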


def generate_pdf_name(ocr_content):
    """Ask the OpenAI API to suggest a document title based on the OCR content."""
    formatted_content = ocr_content.replace("\n", " ")
    try:
        # Prompt for generating a PDF title based on OCR content
        prompt = f"""
        Please suggest a descriptive and unique document title.
        Spaces are preferred over underscores.
        First, correct any spelling mistakes in the following content, then suggest the title: {formatted_content}
        """

        # The instructions and the OCR text are both carried in the system message;
        # the user message is intentionally left empty.
        response = client.chat.completions.create(
            model="gpt-4-turbo-preview",
            messages=[
                {
                    "role": "system",
                    "content": prompt,
                },
                {"role": "user", "content": ""},
            ],
            temperature=1,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )

        if response and response.choices:
            suggested_name = response.choices[0].message.content.strip()
            if "unable to suggest" not in suggested_name:
                return sanitize_filename(suggested_name)
            else:
                return "Unable_To_Suggest_Title"
        else:
            return "No_Response"
    except Exception as e:
        print(f"An error occurred: {e}")
        return "Error_Generated_Title"


def rename_pdf(document_id, new_name):
    """PATCH the document title in Paperless and report whether the call succeeded."""
    headers = {"Authorization": f"Token {PAPERLESS_TOKEN}"}
    data = {"title": new_name}
    response = requests.patch(
        f"{PAPERLESS_BASE_URL}/documents/{document_id}/", headers=headers, data=data
    )
    return response.ok
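# A minimal manual test of the same endpoint (hypothetical host and document id),
# assuming the same Paperless REST API that rename_pdf() targets:
#   curl -X PATCH \
#        -H "Authorization: Token $PAPERLESS_TOKEN" \
#        --data-urlencode "title=New Title" \
#        https://paperless.example.com/api/documents/123/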


def main():
    """Rename all matching Paperless documents using AI-suggested titles."""
    documents = get_all_documents()
    filtered_documents = filter_documents(documents, search_params)

    # Create a dictionary to store the number of retries for each document
    retry_counts = {doc["id"]: 0 for doc in filtered_documents}

    for doc in filtered_documents:
        # Retry renaming a failed document up to MAX_RETRIES times
        while retry_counts[doc["id"]] < MAX_RETRIES:
            ocr_content = doc["content"]
            new_name = generate_pdf_name(ocr_content)

            if new_name and rename_pdf(doc["id"], new_name):
                print(f"Renamed document {doc['id']} to {new_name}")
                break  # Rename successful, move to the next document
            else:
                retry_counts[doc["id"]] += 1
                print(f"Retry {retry_counts[doc['id']]} for document {doc['id']}")

        # Log documents that could not be renamed after all retries
        if retry_counts[doc["id"]] == MAX_RETRIES:
            with open("error.log", "a") as error_log:
                error_log.write(
                    f"Failed to rename document {doc['id']} after {MAX_RETRIES} retries\n"
                )


if __name__ == "__main__":
    main()