Spaces:
Sleeping
Sleeping
File size: 11,895 Bytes
ca2c89c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 |
import html
import time

import matplotlib.pyplot as plt
import nltk
import pandas as pd

from utils.helpers import fig_to_html, df_to_html_table
from utils.model_loader import load_text_generator
def text_generation_handler(text_input, max_length=100, temperature=0.7, top_p=0.9, num_sequences=1):
    """Run GPT-2 text generation on a user prompt and return the result page as HTML.

    Args:
        text_input: Raw user prompt. Treated as untrusted and HTML-escaped
            before being interpolated into any markup.
        max_length: Number of additional tokens to generate beyond the prompt.
        temperature: Sampling temperature forwarded to the generator.
        top_p: Nucleus-sampling (top-p) cutoff forwarded to the generator.
        num_sequences: Number of alternative continuations to generate.

    Returns:
        str: A single HTML fragment containing the prompt, the generated
        continuation(s), per-sequence statistics, a word-length comparison
        chart, and static informational sections.
    """
    output_html = []
    # Add result area container
    output_html.append('<div class="result-area">')
    output_html.append('<h2 class="task-header">Text Generation</h2>')
    output_html.append("""
    <div class="alert alert-info">
        <i class="fas fa-info-circle"></i>
        Text generation models can continue or expand on a given text prompt, creating new content that follows the style and context of the input.
    </div>
    """)
    # Model info
    output_html.append("""
    <div class="alert alert-info">
        <h4><i class="fas fa-tools"></i> Model Used:</h4>
        <ul>
            <li><b>GPT-2</b> - 124M parameter language model trained on a diverse corpus of internet text</li>
            <li><b>Capabilities</b> - Can generate coherent text continuations and completions</li>
            <li><b>Limitations</b> - May occasionally produce repetitive or nonsensical content</li>
        </ul>
    </div>
    """)
    try:
        # Whitespace-split word count is only a rough proxy for GPT-2 BPE
        # tokens, but it is cheap and adequate for truncation / length caps.
        MAX_PROMPT_LENGTH = 100  # approximate tokens

        token_count = len(text_input.split())
        # Truncate over-long prompts so generation stays focused
        if token_count > MAX_PROMPT_LENGTH:
            prompt_text = " ".join(text_input.split()[:MAX_PROMPT_LENGTH])
            output_html.append("""
            <div class="alert alert-warning">
                <p class="mb-0">⚠️ Text truncated to approximately 100 tokens for better generation results.</p>
            </div>
            """)
        else:
            prompt_text = text_input

        # BUGFIX: measure the prompt AFTER truncation. The original used the
        # pre-truncation count here, which inflated max_length for long inputs
        # (e.g. a 500-word prompt truncated to 100 still requested 500+N).
        prompt_token_count = len(prompt_text.split())

        # Escape once; reused everywhere the prompt is rendered. User input
        # must never reach the page unescaped (XSS / broken markup).
        safe_prompt = html.escape(prompt_text)

        # Display prompt
        output_html.append('<h3 class="task-subheader">Prompt</h3>')
        output_html.append(f'<div class="card"><div class="card-body">{safe_prompt}</div></div>')

        # Load model
        text_generator = load_text_generator()

        # Set up generation parameters. HF's max_length counts the prompt
        # tokens too, hence prompt length + requested continuation length.
        generation_kwargs = {
            "max_length": prompt_token_count + max_length,
            "num_return_sequences": num_sequences,
            "temperature": temperature,
            "top_p": top_p,
            "do_sample": True,
            "no_repeat_ngram_size": 2,
            "pad_token_id": 50256  # GPT-2's pad token ID
        }

        # Generate text and time it for the stats panel
        start_time = time.time()
        result = text_generator(prompt_text, **generation_kwargs)
        generation_time = time.time() - start_time

        # Display results
        output_html.append('<h3 class="task-subheader">Generated Text</h3>')
        for i, sequence in enumerate(result):
            generated_text = sequence['generated_text']
            # The pipeline returns prompt + continuation; strip the prompt prefix
            new_text = generated_text[len(prompt_text):]
            safe_new = html.escape(new_text)

            # Display in a nice format with the prompt and generated text distinguished
            if num_sequences > 1:
                output_html.append(f'<h4>Version {i+1}</h4>')
            output_html.append(f"""
            <div class="card">
                <div class="card-body">
                    <span class="text-muted">{safe_prompt}</span>
                    <span class="text-primary fw-bold">{safe_new}</span>
                </div>
            </div>
            """)

            # Generation stats for this sequence
            prompt_tokens = len(prompt_text.split())
            gen_words = new_text.split()
            gen_tokens = len(gen_words)
            # Average word length as a crude complexity metric; max(1, ...)
            # guards against an empty continuation.
            avg_word_len = sum(len(word) for word in gen_words) / max(1, gen_tokens)
            output_html.append(f"""
            <div class="alert alert-success">
                <h4 class="mb-3">Generation Statistics</h4>
                <div class="row">
                    <div class="col-md-6">
                        <p><b>Prompt length:</b> {prompt_tokens} tokens</p>
                        <p><b>Generated length:</b> {gen_tokens} tokens</p>
                        <p><b>Total length:</b> {prompt_tokens + gen_tokens} tokens</p>
                    </div>
                    <div class="col-md-6">
                        <p><b>Temperature:</b> {temperature}</p>
                        <p><b>Top-p:</b> {top_p}</p>
                        <p><b>Avg word length:</b> {avg_word_len:.2f} characters</p>
                    </div>
                </div>
                <p><b>Generation time:</b> {generation_time:.2f} seconds</p>
            </div>
            """)

            # Option to see full text (model output is untrusted too — escape it)
            output_html.append(f"""
            <div class="card">
                <div class="card-header">
                    <h5 class="mb-0">
                        <button class="btn btn-link" type="button" data-bs-toggle="collapse" data-bs-target="#fullText{i}" aria-expanded="false">
                            Show full text (copy-paste friendly)
                        </button>
                    </h5>
                </div>
                <div class="collapse" id="fullText{i}">
                    <div class="card-body">
                        <div class="text-content" style="word-wrap: break-word; word-break: break-word; overflow-wrap: break-word; max-height: 500px; overflow-y: auto; padding: 15px; background-color: #f8f9fa; border-radius: 5px; border: 1px solid #e9ecef; line-height: 1.6;">{html.escape(generated_text)}</div>
                    </div>
                </div>
            </div>
            """)

        # Generate a text complexity analysis on the first sequence only
        if len(result) > 0:
            output_html.append('<h3 class="task-subheader">Text Analysis</h3>')
            full_text = result[0]['generated_text']
            prompt_words = prompt_text.split()
            full_words = full_text.split()
            generated_words = full_words[len(prompt_words):]

            # Analyze word length distribution
            prompt_word_lengths = [len(word) for word in prompt_words]
            generated_word_lengths = [len(word) for word in generated_words]

            # Create comparison chart
            fig, ax = plt.subplots(figsize=(10, 5))
            bins = range(1, 16)  # Word lengths from 1 to 15
            ax.hist(prompt_word_lengths, bins=bins, alpha=0.7, label='Prompt', color='#1976D2')
            ax.hist(generated_word_lengths, bins=bins, alpha=0.7, label='Generated', color='#4CAF50')
            ax.set_xlabel('Word Length (characters)')
            ax.set_ylabel('Frequency')
            ax.set_title('Word Length Distribution: Prompt vs Generated')
            ax.legend()
            ax.grid(alpha=0.3)
            output_html.append(fig_to_html(fig))
            # BUGFIX: release the figure. In a long-running web handler,
            # pyplot keeps every figure alive until closed (memory leak).
            plt.close(fig)

            # Calculate some linguistic statistics (guard empty splits)
            prompt_avg_word_len = sum(prompt_word_lengths) / len(prompt_word_lengths) if prompt_word_lengths else 0
            generated_avg_word_len = sum(generated_word_lengths) / len(generated_word_lengths) if generated_word_lengths else 0

            # Create comparison table
            stats_data = {
                'Metric': ['Word count', 'Average word length', 'Unique words', 'Lexical diversity*'],
                'Prompt': [
                    len(prompt_words),
                    f"{prompt_avg_word_len:.2f}",
                    len(set(word.lower() for word in prompt_words)),
                    f"{len(set(word.lower() for word in prompt_words)) / len(prompt_words):.2f}" if prompt_words else "0"
                ],
                'Generated': [
                    len(generated_words),
                    f"{generated_avg_word_len:.2f}",
                    len(set(word.lower() for word in generated_words)),
                    f"{len(set(word.lower() for word in generated_words)) / len(generated_words):.2f}" if generated_words else "0"
                ]
            }
            stats_df = pd.DataFrame(stats_data)
            output_html.append('<div class="mt-3">')
            output_html.append(df_to_html_table(stats_df))
            output_html.append('<p><small>*Lexical diversity = unique words / total words</small></p>')
            output_html.append('</div>')

        # Show tips for better results
        output_html.append("""
        <div class="alert alert-info">
            <h4>Tips for Better Generation Results</h4>
            <ul class="mb-0">
                <li><b>Be specific</b> - More detailed prompts give the model better context</li>
                <li><b>Format matters</b> - If you want a list, start with a list item; if you want dialogue, include dialogue format</li>
                <li><b>Play with temperature</b> - Lower values (0.3-0.5) for focused, consistent text; higher values (0.7-1.0) for creative, varied output</li>
                <li><b>Try multiple generations</b> - Generate several options to pick the best result</li>
            </ul>
        </div>
        """)
    except Exception as e:
        # Exception text can echo user input; escape before rendering
        output_html.append(f"""
        <div class="alert alert-danger">
            <h3>Error</h3>
            <p>Failed to generate text: {html.escape(str(e))}</p>
        </div>
        """)

    # About Text Generation section (static, always appended)
    output_html.append("""
    <div class="card mt-4">
        <div class="card-header">
            <h4 class="mb-0">
                <i class="fas fa-info-circle"></i>
                About Text Generation
            </h4>
        </div>
        <div class="card-body">
            <h5>What is Text Generation?</h5>
            <p>Text generation is the task of creating human-like text using machine learning models. Modern text generation
            systems use large neural networks trained on vast amounts of text data to predict the next tokens in a sequence.</p>
            <h5>How It Works:</h5>
            <ol>
                <li><b>Training</b> - Models learn patterns in language by predicting the next word in billions of text examples</li>
                <li><b>Prompting</b> - You provide a starting text that gives context and direction</li>
                <li><b>Generation</b> - The model repeatedly predicts the most likely next token based on previous context</li>
                <li><b>Sampling</b> - Various techniques (temperature, top-p) control the randomness and creativity of output</li>
            </ol>
            <h5>Applications:</h5>
            <ul>
                <li><b>Content creation</b> - Drafting articles, stories, and marketing copy</li>
                <li><b>Assistive writing</b> - Helping with email drafting, summarization, and editing</li>
                <li><b>Conversational AI</b> - Powering chatbots and digital assistants</li>
                <li><b>Code generation</b> - Assisting developers with coding tasks</li>
                <li><b>Creative writing</b> - Generating stories, poetry, and other creative content</li>
            </ul>
        </div>
    </div>
    """)
    output_html.append('</div>')  # Close result-area div
    return '\n'.join(output_html)
|