Spaces:

chandanzeon
/

DoclingAPI

Sleeping

App Files Files Community

DoclingAPI / app.py

chandanzeon

First Commit

329ee91 11 months ago

raw

history blame contribute delete

3.18 kB

	from fastapi import FastAPI, HTTPException
	from dotenv import load_dotenv
	import boto3
	import os
	import uvicorn
	import logging
	from uuid import uuid4
	from pydantic import BaseModel
	from helper import PdfToSectionConverter

	# Pull any variables defined in a local .env file into the process environment.
	load_dotenv()

	# Module-wide logger; the root logger is configured to emit INFO and above.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# The S3 credentials and region are supplied through the environment.
	s3_access_key_id, s3_secret_key, aws_region = (
	    os.environ.get(variable)
	    for variable in ("S3_ACCESS_KEY_ID", "S3_SECRET_KEY", "AWS_REGION")
	)

	# Fail fast at import time when any of the three settings is unset or empty.
	if not (s3_access_key_id and s3_secret_key and aws_region):
	    logger.error("Missing AWS S3 credentials in environment variables.")
	    raise ValueError("AWS credentials not set properly.")

	# Shared boto3 S3 client, built from the credentials and region read from
	# the environment earlier in this module.
	s3_client = boto3.client(
	    "s3",
	    region_name=aws_region,
	    aws_access_key_id=s3_access_key_id,
	    aws_secret_access_key=s3_secret_key,
	)

	# FastAPI application object that the route decorators in this module attach to.
	app = FastAPI()

	# Request body accepted by the POST /convert_pdf endpoint.
	class PdfRequest(BaseModel):
	    # Location of the source PDF; the handler rejects values that do not
	    # start with "s3://".
	    s3_file_path: str

	    # Forwarded to the converter as ``file_title``.
	    file_title: str

	    # Forwarded to the converter as ``doc_id``.
	    doc_id: str

	    # Forwarded to the converter as ``start_page_no`` / ``end_page_no``.
	    # Both default to 0 (NOTE(review): the meaning of 0 is defined by the
	    # converter, which is not visible here -- confirm).
	    start_page: int = 0
	    end_page: int = 0

	# Root route: answers GET / with a fixed readiness message.
	@app.get("/")
	async def start():
	    readiness = {"message": "Parser API is Ready"}
	    return readiness

	@app.post("/convert_pdf")
	async def convert_pdf(request: PdfRequest):
	    """Download a PDF from S3, run it through the section converter, and return the result.

	    Args:
	        request: Body carrying the ``s3://bucket/key`` location of the PDF plus
	            the title, document id and page bounds forwarded to the converter.

	    Returns:
	        ``{"status": "success", "data": <converter output>}``.

	    Raises:
	        HTTPException: 400 when ``s3_file_path`` does not start with ``s3://``
	            or lacks a bucket or an object key; 500 when the S3 download
	            fails or any other unexpected error occurs.
	    """
	    s3_scheme = "s3://"
	    output_dir = "/tmp"

	    # Validate S3 file path
	    if not request.s3_file_path.startswith(s3_scheme):
	        raise HTTPException(status_code=400, detail="Invalid S3 file path. Must start with 's3://'")

	    # Strip only the leading scheme: str.replace would also delete any later
	    # "s3://" occurring inside the object key.
	    bucket_name, _, object_key = request.s3_file_path[len(s3_scheme):].partition("/")
	    if not bucket_name or not object_key:
	        # Covers "s3://bucket", "s3://bucket/" and "s3:///key".
	        raise HTTPException(status_code=400, detail="Invalid S3 file path format.")

	    # Each request gets its own temporary file so that concurrent requests
	    # cannot overwrite or delete each other's download.
	    output_path = os.path.join(output_dir, f"temp_file_{uuid4().hex}.pdf")

	    # NOTE(review): download_file and converter.convert are called without
	    # await inside an async handler, so the event loop is blocked while they
	    # run. Consider a plain ``def`` handler or a thread pool if that matters.
	    try:
	        # Ensure the directory exists
	        os.makedirs(output_dir, exist_ok=True)

	        logger.info("Downloading %s from S3 bucket %s...", request.s3_file_path, bucket_name)

	        # Download PDF from S3
	        try:
	            s3_client.download_file(bucket_name, object_key, output_path)
	        except Exception as e:
	            logger.exception("Failed to download file from S3: %s", e)
	            raise HTTPException(status_code=500, detail="Error downloading file from S3.") from e

	        # Initialize and run the converter
	        converter = PdfToSectionConverter()
	        output = converter.convert(
	            downloaded_pdf_path=output_path,
	            file_title=request.file_title,
	            doc_id=request.doc_id,
	            start_page_no=request.start_page,
	            end_page_no=request.end_page,
	        )

	        return {"status": "success", "data": output}

	    except HTTPException:
	        # Already carries the intended status code; pass it through untouched.
	        raise
	    except Exception as e:
	        logger.exception("Unexpected error: %s", e)
	        raise HTTPException(status_code=500, detail="Internal Server Error.") from e
	    finally:
	        # Cleanup the temporary file on every path, including converter failures.
	        try:
	            os.remove(output_path)
	        except FileNotFoundError:
	            # Nothing to clean up: the download never produced the file.
	            pass
	        except OSError as e:
	            # Cleanup is best-effort; do not mask the real outcome of the request.
	            logger.warning("Could not remove temporary file %s: %s", output_path, e)

	def start_server(host: str = "0.0.0.0", port: int = 8000, reload: bool = True) -> None:
	    """Run the ``app:app`` application under uvicorn.

	    The defaults reproduce the previously hard-coded settings, so calling
	    ``start_server()`` with no arguments behaves as before.

	    Args:
	        host: Address passed to ``uvicorn.run`` as ``host``.
	        port: Port passed to ``uvicorn.run`` as ``port``.
	        reload: Passed to ``uvicorn.run`` as ``reload``; pass ``False`` to
	            disable it.
	    """
	    logger.info("Starting Server...")
	    uvicorn.run("app:app", host=host, port=port, reload=reload)


	# Start the server only when this module is executed as a script.
	if __name__ == "__main__":
	    start_server()