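"""Gradio Space for collecting AI image-editing results.

For each post in the taesiri/IERv2-Subset dataset, annotators can upload up to
ten edited-image responses (image + model name + notes). Submissions are saved
under ./data/<post_id>/ and periodically committed to an intermediate dataset
repo on the Hugging Face Hub.
"""
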
import gradio as gr
import json
import os
import shutil
import threading
import time
from pathlib import Path

import git
from datasets import load_dataset
from huggingface_hub import CommitScheduler, HfApi

from utils import process_and_push_dataset

api = HfApi(token=os.environ["HF_TOKEN"])

VALID_DATASET = load_dataset("taesiri/IERv2-Subset", split="train")
VALID_DATASET_POST_IDS = (
    load_dataset("taesiri/IERv2-Subset", split="train", columns=["post_id"])
    .to_pandas()["post_id"]
    .tolist()
)
POST_ID_TO_ID_MAP = {post_id: idx for idx, post_id in enumerate(VALID_DATASET_POST_IDS)}

DATASET_REPO = "taesiri/AIImageEditingResults_Intemediate"
FINAL_DATASET_REPO = "taesiri/AIImageEditingResults"
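
# Raw per-post folders are committed to the intermediate repo by the scheduler
# below; process_and_push_dataset (currently disabled) would consolidate them
# into the final repo.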

# Download existing data from the hub
def sync_with_hub():
    """
    Synchronize local data with the hub by cloning the dataset repo.
    """
    print("Starting sync with hub...")
    data_dir = Path("./data")
    if data_dir.exists():
        # Back up existing data before merging
        backup_dir = Path("./data_backup")
        if backup_dir.exists():
            shutil.rmtree(backup_dir)
        shutil.copytree(data_dir, backup_dir)

    # Clone/pull the latest data from the hub, authenticating with the token
    # embedded in the remote URL (https://<user>:<token>@huggingface.co/...)
    token = os.environ["HF_TOKEN"]
    username = DATASET_REPO.split("/")[0]
    repo_url = f"https://{username}:{token}@huggingface.co/datasets/{DATASET_REPO}"
    hub_data_dir = Path("hub_data")
    if hub_data_dir.exists():
        # The repo already exists locally, so pull the latest changes
        print("Pulling latest changes...")
        repo = git.Repo(hub_data_dir)
        origin = repo.remotes.origin
        # Refresh the remote URL so it carries the current token
        if "https://" in origin.url:
            origin.set_url(repo_url)
        origin.pull()
    else:
        # First run: clone the repo with the token
        print("Cloning repository...")
        git.Repo.clone_from(repo_url, hub_data_dir)

    # Merge hub data into local data; existing local folders take precedence
    hub_data_source = hub_data_dir / "data"
    if hub_data_source.exists():
        # Create the data dir if it doesn't exist
        data_dir.mkdir(exist_ok=True)
        # Copy per-post folders from the hub
        for item in hub_data_source.glob("*"):
            if item.is_dir():
                dest = data_dir / item.name
                if not dest.exists():  # Only copy if it doesn't exist locally
                    shutil.copytree(item, dest)

    # Clean up the cloned repo
    if hub_data_dir.exists():
        shutil.rmtree(hub_data_dir)
    print("Finished syncing with hub!")
scheduler = CommitScheduler(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    folder_path="./data",
    path_in_repo="data",
    every=1,  # minutes
)

def load_question_data(question_id):
    """
    Load a specific question's data.
    Returns a list of values for all 11 form fields.
    """
    if not question_id:
        return [None] * 11  # One placeholder per form field

    # Extract the ID part before the colon from the dropdown selection
    question_id = (
        question_id.split(":")[0].strip() if ":" in question_id else question_id
    )
    json_path = os.path.join("./data", question_id, "question.json")
    if not os.path.exists(json_path):
        print(f"Question file not found: {json_path}")
        return [None] * 11

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.loads(f.read().strip())

        # Resolve stored image paths relative to the question's folder
        def load_image(image_path):
            if not image_path:
                return None
            full_path = os.path.join(
                "./data", question_id, os.path.basename(image_path)
            )
            return full_path if os.path.exists(full_path) else None

        question_images = data.get("question_images", [])
        rationale_images = data.get("rationale_images", [])

        return [
            (
                ",".join(data["question_categories"])
                if isinstance(data["question_categories"], list)
                else data["question_categories"]
            ),
            data["question"],
            data["final_answer"],
            data.get("rationale_text", ""),
            load_image(question_images[0] if question_images else None),
            load_image(question_images[1] if len(question_images) > 1 else None),
            load_image(question_images[2] if len(question_images) > 2 else None),
            load_image(question_images[3] if len(question_images) > 3 else None),
            load_image(rationale_images[0] if rationale_images else None),
            load_image(rationale_images[1] if len(rationale_images) > 1 else None),
            question_id,
        ]
    except Exception as e:
        print(f"Error loading question {question_id}: {str(e)}")
        return [None] * 11
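
# NOTE: load_question_data above is not referenced by the UI in this file.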

def load_post_image(post_id):
    if not post_id:
        # 33 outputs: source image + instruction + simplified instruction
        # + 10 (image, model name, notes) triplets
        return [None] * 33

    idx = POST_ID_TO_ID_MAP[post_id]
    source_image = VALID_DATASET[idx]["image"]
    instruction = VALID_DATASET[idx]["instruction"]
    simplified_instruction = VALID_DATASET[idx]["simplified_instruction"]

    # Load existing responses, if any
    post_folder = os.path.join("./data", str(post_id))
    metadata_path = os.path.join(post_folder, "metadata.json")
    if os.path.exists(metadata_path):
        with open(metadata_path, "r") as f:
            metadata = json.load(f)

        # Initialize all ten response slots with empty values
        responses = [(None, "", "")] * 10

        # Fill in existing responses
        for response in metadata["responses"]:
            idx = response["response_id"]
            if idx < 10:  # Ensure we don't exceed our UI limit
                image_path = os.path.join(post_folder, response["image_path"])
                responses[idx] = (
                    image_path,
                    response["answer_text"],
                    response.get("notes", ""),
                )

        # Flatten responses for output
        flat_responses = [item for triplet in responses for item in triplet]
        return [source_image, instruction, simplified_instruction] + flat_responses

    # No existing responses: return the source image, instructions, and empty slots
    return [source_image, instruction, simplified_instruction] + [None] * 30

def generate_json_files(source_image, responses, post_id):
    """
    Save the source image and multiple responses to the data directory.

    Args:
        source_image: Path to the source image (or a PIL image)
        responses: List of (image, answer, notes) tuples
        post_id: The post ID from the dataset
    """
    # Create the parent data folder if it doesn't exist
    parent_data_folder = "./data"
    os.makedirs(parent_data_folder, exist_ok=True)

    # Create/clear the post_id folder
    post_folder = os.path.join(parent_data_folder, str(post_id))
    if os.path.exists(post_folder):
        shutil.rmtree(post_folder)
    os.makedirs(post_folder)

    # Save the source image (the Image components use type="filepath", so this
    # is normally a path string; otherwise assume a PIL image and use save())
    source_image_path = os.path.join(post_folder, "source_image.png")
    if isinstance(source_image, str):
        shutil.copy2(source_image, source_image_path)
    else:
        source_image.save(source_image_path)

    # Create the responses data
    responses_data = []
    for idx, (response_image, answer_text, notes) in enumerate(responses):
        if response_image and answer_text:  # Only process if both image and text exist
            response_folder = os.path.join(post_folder, f"response_{idx}")
            os.makedirs(response_folder)

            # Save the response image
            response_image_path = os.path.join(response_folder, "response_image.png")
            if isinstance(response_image, str):
                shutil.copy2(response_image, response_image_path)
            else:
                response_image.save(response_image_path)

            # Add to responses data
            responses_data.append(
                {
                    "response_id": idx,
                    "answer_text": answer_text,
                    "notes": notes,
                    "image_path": f"response_{idx}/response_image.png",
                }
            )

    # Create the metadata JSON
    metadata = {
        "post_id": post_id,
        "source_image": "source_image.png",
        "responses": responses_data,
    }

    # Save metadata
    with open(os.path.join(post_folder, "metadata.json"), "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)

    return post_folder
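
# Resulting on-disk layout for each submission:
#
#   data/<post_id>/
#       metadata.json
#       source_image.png
#       response_0/response_image.png
#       response_1/response_image.png
#       ...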

def get_statistics():
    """
    Scan the data folder and return statistics about the responses.
    """
    data_dir = Path("./data")
    if not data_dir.exists():
        return "No data directory found"

    # Folder names are strings, so compare against stringified post IDs
    valid_post_ids = set(map(str, VALID_DATASET_POST_IDS))
    total_expected_posts = len(VALID_DATASET_POST_IDS)
    processed_post_ids = set()
    posts_with_responses = 0
    total_responses = 0
    responses_per_post = []  # Number of responses for each processed post

    for metadata_file in data_dir.glob("*/metadata.json"):
        post_id = metadata_file.parent.name
        if post_id in valid_post_ids:  # Only count valid posts
            processed_post_ids.add(post_id)
            try:
                with open(metadata_file, "r") as f:
                    metadata = json.load(f)
                num_responses = len(metadata.get("responses", []))
                responses_per_post.append(num_responses)
                if num_responses > 0:
                    posts_with_responses += 1
                total_responses += num_responses
            except Exception:
                continue

    missing_posts = valid_post_ids - processed_post_ids
    total_processed = len(processed_post_ids)

    # Calculate additional statistics
    if responses_per_post:
        responses_per_post.sort()
        median_responses = responses_per_post[len(responses_per_post) // 2]
        max_responses = max(responses_per_post)
        avg_responses = (
            total_responses / posts_with_responses if posts_with_responses > 0 else 0
        )
    else:
        median_responses = max_responses = avg_responses = 0

    stats = f"""
📊 Collection Statistics:

Dataset Coverage:
- Total Expected Posts: {total_expected_posts}
- Posts Processed: {total_processed}
- Missing Posts: {len(missing_posts)} ({', '.join(list(missing_posts)[:5])}{'...' if len(missing_posts) > 5 else ''})
- Coverage Rate: {(total_processed / total_expected_posts * 100):.2f}%

Response Statistics:
- Posts with Responses: {posts_with_responses}
- Posts without Responses: {total_processed - posts_with_responses}
- Total Individual Responses: {total_responses}

Response Distribution:
- Median Responses per Post: {median_responses}
- Average Responses per Post: {avg_responses:.2f}
- Maximum Responses for a Post: {max_responses}
"""
    return stats

# Build the Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# Image Response Collector")

    # Source image selection at the top
    with gr.Row():
        with gr.Column():
            post_id_dropdown = gr.Dropdown(
                label="Select Post ID to Load Image",
                choices=VALID_DATASET_POST_IDS,
                type="value",
                allow_custom_value=False,
            )
            instruction_text = gr.Textbox(label="Instruction", interactive=False)
            simplified_instruction_text = gr.Textbox(
                label="Simplified Instruction", interactive=False
            )
            source_image = gr.Image(label="Source Image", type="filepath", height=300)

    # Responses in tabs
    with gr.Tabs() as response_tabs:
        responses = []
        for i in range(10):
            with gr.Tab(f"Response {i+1}"):
                img = gr.Image(
                    label=f"Response Image {i+1}", type="filepath", height=300
                )
                txt = gr.Textbox(label=f"Model Name {i+1}", lines=2)
                notes = gr.Textbox(label=f"Miscellaneous Notes {i+1}", lines=3)
                responses.append((img, txt, notes))

    with gr.Row():
        submit_btn = gr.Button("Submit All Responses")
        clear_btn = gr.Button("Clear Form")

    # Statistics accordion
    with gr.Accordion("Collection Statistics", open=False):
        stats_text = gr.Markdown("Loading statistics...")
        refresh_stats_btn = gr.Button("Refresh Statistics")

    refresh_stats_btn.click(fn=get_statistics, outputs=[stats_text])

    # Populate the statistics on page load (registered inside the Blocks context)
    demo.load(
        fn=get_statistics,
        outputs=[stats_text],
    )

    def submit_responses(
        source_img, post_id, instruction, simplified_instruction, *response_data
    ):
        if not source_img:
            gr.Warning("Please select a source image first!")
            return
        if not post_id:
            gr.Warning("Please select a post ID first!")
            return

        # Convert flat response_data into triplets of (image, text, notes)
        response_triplets = list(
            zip(response_data[::3], response_data[1::3], response_data[2::3])
        )

        # Check for responses with images but no model names
        # (guard against None: cleared Textboxes may return None instead of "")
        incomplete_responses = [
            i + 1
            for i, (img, txt, _) in enumerate(response_triplets)
            if img is not None and not (txt or "").strip()
        ]
        if incomplete_responses:
            gr.Warning(
                f"Please provide model names for responses: {', '.join(map(str, incomplete_responses))}!"
            )
            return

        # Keep only complete responses (both an image and a model name)
        valid_responses = [
            (img, txt, notes)
            for img, txt, notes in response_triplets
            if img is not None and (txt or "").strip()
        ]
        if not valid_responses:
            gr.Warning("Please provide at least one response (image + model name)!")
            return

        # Write the images and metadata JSON for this post
        generate_json_files(source_img, valid_responses, post_id)
        gr.Info("Responses saved successfully! 🎉")

    def clear_form():
        # 33 outputs: source image + 2 instruction fields + 10 triplets
        return [None] * 33

    # Connect components
    post_id_dropdown.change(
        fn=load_post_image,
        inputs=[post_id_dropdown],
        outputs=[source_image, instruction_text, simplified_instruction_text]
        + [comp for triplet in responses for comp in triplet],
    )

    submit_inputs = [
        source_image,
        post_id_dropdown,
        instruction_text,
        simplified_instruction_text,
    ] + [comp for triplet in responses for comp in triplet]
    submit_btn.click(fn=submit_responses, inputs=submit_inputs)

    clear_outputs = [source_image, instruction_text, simplified_instruction_text] + [
        comp for triplet in responses for comp in triplet
    ]
    clear_btn.click(fn=clear_form, outputs=clear_outputs)

def process_thread():
    # Periodically process ./data and push it to the final dataset repo.
    # The push itself is currently disabled.
    while True:
        try:
            pass
            # process_and_push_dataset(
            #     "./data",
            #     FINAL_DATASET_REPO,
            #     token=os.environ["HF_TOKEN"],
            #     private=True,
            # )
        except Exception as e:
            print(f"Error in process thread: {e}")
        time.sleep(120)  # Sleep for 2 minutes

if __name__ == "__main__":
    print("Initializing app...")
    sync_with_hub()  # Sync before launching the app
    print("Starting Gradio interface...")

    # Start the background processing thread when the app starts
    processing_thread = threading.Thread(target=process_thread, daemon=True)
    processing_thread.start()

    demo.launch()