Initial Commit
This commit is contained in:
commit
27de462bc3
7 changed files with 392 additions and 0 deletions
134
main.py
Normal file
134
main.py
Normal file
|
@ -0,0 +1,134 @@
|
|||
import os
|
||||
import requests
|
||||
import re
|
||||
import fnmatch
|
||||
from openai import OpenAI
|
||||
|
||||
# Load configuration from environment variables
CHATGPT_TOKEN = os.getenv("CHATGPT_TOKEN")
PAPERLESS_TOKEN = os.getenv("PAPERLESS_TOKEN")
PAPERLESS_BASE_URL = os.getenv("PAPERLESS_BASE_URL")

# Validate the tokens BEFORE building the OpenAI client: the client
# constructor rejects a missing API key on its own, which would otherwise
# pre-empt this (clearer) error message.
if not CHATGPT_TOKEN or not PAPERLESS_TOKEN:
    raise ValueError("Environment variables for tokens are not set")

# Without a base URL every request below would target "None/documents/".
if not PAPERLESS_BASE_URL:
    raise ValueError("Environment variable PAPERLESS_BASE_URL is not set")

client = OpenAI(api_key=CHATGPT_TOKEN)

# Set search parameters to get all documents
search_params = ["*"]

# Maximum number of retries for renaming a failed document
MAX_RETRIES = 3
|
||||
|
||||
|
||||
def get_all_documents():
    """Fetch every document from the Paperless API, following pagination.

    Starts at ``{PAPERLESS_BASE_URL}/documents/`` and keeps requesting the
    URL found under the ``"next"`` key of each response until it is absent
    or empty.

    Returns:
        A list of dicts, one per document, each with the keys ``"id"``,
        ``"title"`` and ``"content"`` (``""`` when the API omits it).

    Raises:
        requests.HTTPError: If a page request returns an error status.
        requests.RequestException: On timeouts or connection failures.
    """
    headers = {"Authorization": f"Token {PAPERLESS_TOKEN}"}
    url = f"{PAPERLESS_BASE_URL}/documents/"
    documents = []
    while url:
        # A timeout keeps a stalled server from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=60)
        # Fail loudly on 4xx/5xx: an error payload has no "results" key and
        # would otherwise be indistinguishable from "no documents".
        response.raise_for_status()
        data = response.json()
        documents.extend(
            {
                "id": doc["id"],
                "title": doc["title"],
                "content": doc.get("content", ""),
            }
            for doc in data.get("results", [])
        )
        url = data.get("next")
    return documents
|
||||
|
||||
|
||||
def filter_documents(documents, search_params):
    """Return the documents whose title matches at least one glob pattern.

    Args:
        documents: Iterable of dicts that each carry a ``"title"`` key.
        search_params: Iterable of ``fnmatch``-style patterns.

    Returns:
        A new list with the matching documents, in their original order;
        each document appears at most once even if several patterns match.
    """
    return [
        candidate
        for candidate in documents
        if any(fnmatch.fnmatch(candidate["title"], glob) for glob in search_params)
    ]
|
||||
|
||||
|
||||
def sanitize_filename(name):
    """Strip the characters ``\\ / * ? : " < > |`` from *name*.

    All other characters, including spaces, are left untouched.
    """
    # Deletion-only translation table: no replacements, just removals.
    forbidden = str.maketrans("", "", '\\/*?:"<>|')
    return name.translate(forbidden)
|
||||
|
||||
|
||||
def generate_pdf_name(ocr_content):
    """Ask the chat model for a document title based on OCR text.

    Args:
        ocr_content: OCR text of the document. ``None`` or an empty string
            is accepted and sent as empty content.

    Returns:
        The suggested title with invalid filename characters removed, or
        one of the placeholder strings ``"Unable_To_Suggest_Title"`` (the
        model declined), ``"No_Response"`` (no choices or no message text
        came back) or ``"Error_Generated_Title"`` (any exception occurred).
        This function does not raise.
    """
    # The content may be missing/null upstream; calling .replace on None
    # would raise before the try block could catch it.
    formatted_content = (ocr_content or "").replace("\n", " ")
    try:
        # Prompt for generating a PDF title based on OCR content
        prompt = f"""
        Please suggest a descriptive and unique document title.
        Spaces are preferred over underscores.
        First, correct any spelling mistakes in the following content, then suggest the title: {formatted_content}
        """

        response = client.chat.completions.create(
            model="gpt-4-turbo-preview",
            messages=[
                {
                    "role": "system",
                    "content": prompt,
                },
                {"role": "user", "content": ""},
            ],
            temperature=1,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        if not (response and response.choices):
            return "No_Response"

        content = response.choices[0].message.content
        # The message text can be None; report that as a missing response
        # rather than letting .strip() fail and look like a generic error.
        if content is None:
            return "No_Response"

        suggested_name = content.strip()
        # Case-insensitive so "Unable to suggest ..." is recognised too.
        if "unable to suggest" in suggested_name.lower():
            return "Unable_To_Suggest_Title"
        return sanitize_filename(suggested_name)
    except Exception as e:
        # Top-level boundary for the API call: log and signal failure.
        print(f"An error occurred: {e}")
        return "Error_Generated_Title"
|
||||
|
||||
|
||||
def rename_pdf(document_id, new_name):
    """Set the title of a Paperless document via a PATCH request.

    Args:
        document_id: Identifier of the document to rename.
        new_name: New title to store.

    Returns:
        ``True`` when the server answered with a success status, ``False``
        on an error status or when the request itself failed (timeout,
        connection error), so the caller can treat both as a failed attempt.
    """
    headers = {"Authorization": f"Token {PAPERLESS_TOKEN}"}
    data = {"title": new_name}
    try:
        response = requests.patch(
            f"{PAPERLESS_BASE_URL}/documents/{document_id}/",
            headers=headers,
            data=data,
            timeout=60,  # do not hang forever on a stalled server
        )
    except requests.RequestException as e:
        # A transport failure is just another unsuccessful attempt; it must
        # not abort the caller's retry loop.
        print(f"Failed to rename document {document_id}: {e}")
        return False
    return response.ok
|
||||
|
||||
|
||||
def main():
    """Rename every matching Paperless document using a generated title.

    Fetches all documents, filters them with ``search_params`` and, for
    each one, tries up to ``MAX_RETRIES`` times to generate a title and
    apply it. Documents that never succeed are appended to ``error.log``.
    """
    # Placeholder strings generate_pdf_name returns when it could not
    # produce a real title. They are truthy, so they must be rejected
    # explicitly or they would be written to the document as its title.
    failure_titles = {
        "Unable_To_Suggest_Title",
        "No_Response",
        "Error_Generated_Title",
    }

    documents = get_all_documents()
    filtered_documents = filter_documents(documents, search_params)

    for doc in filtered_documents:
        doc_id = doc["id"]
        # Retry renaming a failed document up to MAX_RETRIES times
        for attempt in range(1, MAX_RETRIES + 1):
            new_name = generate_pdf_name(doc["content"])

            if (
                new_name
                and new_name not in failure_titles
                and rename_pdf(doc_id, new_name)
            ):
                print(f"Renamed document {doc_id} to {new_name}")
                break  # Rename successful, move to the next document

            print(f"Retry {attempt} for document {doc_id}")
        else:
            # Loop finished without a successful rename: log the failure
            with open("error.log", "a", encoding="utf-8") as error_log:
                error_log.write(
                    f"Failed to rename document {doc_id} after {MAX_RETRIES} retries\n"
                )


if __name__ == "__main__":
    main()
|
Loading…
Add table
Add a link
Reference in a new issue