glitz-dev committed on
Commit 3b3e0b9
1 Parent(s): cf1842c

Split the HIPAA analyzer file into separate questions and annotations modules (given by Nataraj)

Files changed (4)
  1. hipaathesis.py +465 -81
  2. pubtator_annotator.py +50 -0
  3. questions.py +20 -0
  4. static/thesis.pdf +0 -0
hipaathesis.py CHANGED
(Old version shown first, with removed lines marked "-". The updated file, with added lines marked "+", follows after these hunks.)

@@ -1,7 +1,81 @@
  import PyPDF2
  import re
  from collections import Counter
  import nltk
  from nltk.tokenize import sent_tokenize, word_tokenize
  from nltk.corpus import stopwords
  from nltk.stem import WordNetLemmatizer
@@ -9,7 +83,7 @@ import string
  from datetime import datetime, timedelta
  import json
  import torch
- from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline, BlipProcessor, BlipForConditionalGeneration
  import warnings
  import fitz  # PyMuPDF
  from PIL import Image, ImageEnhance, ImageFilter
@@ -39,27 +113,57 @@ except ImportError:
      OPENCV_AVAILABLE = False
  import numpy as np

  warnings.filterwarnings('ignore')

  app = FastAPI(title='AI (PDF→Summary+QnA+Scores)', version='0.2.1')
- app.mount("/test", StaticFiles(directory="test"), name="test")

  class HIPAALogger:
      """HIPAA-compliant audit logging system"""

      def __init__(self, log_file="hipaa_audit.log"):
-         self.log_file = log_file
          self.setup_logging()

      def setup_logging(self):
-         """Setup secure audit logging"""
-         logging.basicConfig(
-             filename=self.log_file,
-             level=logging.INFO,
-             format='%(asctime)s - %(levelname)s - %(message)s',
-             datefmt='%Y-%m-%d %H:%M:%S'
-         )
-         self.logger = logging.getLogger('HIPAA_AUDIT')

      def log_access(self, user_id, action, resource, success=True):
          """Log access attempts and actions"""
@@ -111,13 +215,43 @@ class SecureFileHandler:

      def secure_save(self, data, filepath):
          """Save data with encryption"""
-         if self.fernet:
-             encrypted_data = self.encrypt_data(json.dumps(data))
-             with open(filepath + '.enc', 'wb') as f:
-                 f.write(encrypted_data)
-         else:
-             with open(filepath, 'w', encoding='utf-8') as f:
-                 json.dump(data, f, indent=2)

      def secure_load(self, filepath):
          """Load encrypted data"""
@@ -156,11 +290,26 @@ class SecureFileHandler:
  class HIPAACompliantThesisAnalyzer:
      """HIPAA-compliant version of the thesis analyzer"""

-     def __init__(self, user_id=None, password=None, session_timeout=30):
          self.user_id = user_id or getpass.getuser()
          self.session_timeout = session_timeout  # minutes
          self.session_start = datetime.now()
          self.last_activity = datetime.now()

          # Initialize HIPAA compliance components
          self.hipaa_logger = HIPAALogger()
@@ -184,8 +333,19 @@ class HIPAACompliantThesisAnalyzer:
          except LookupError as e:
              print(f"NLTK resource error: {e}")
              self._download_nltk_resources()
-             self.lemmatizer = WordNetLemmatizer()
-             self.stop_words = set(stopwords.words('english'))

          self.thesis_text = ""
          self.sentences = []
@@ -196,36 +356,66 @@ class HIPAACompliantThesisAnalyzer:
          self.use_ocr = True
          self.use_blip = True

-         # Initialize T5 model
-         print("Loading T5-small model (HIPAA-compliant local processing)...")
          self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

-         self.model_name = "t5-small"
-         self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
-         self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)
-         self.model.to(self.device)

          # Initialize pipelines
-         self.summarizer = pipeline(
-             "summarization",
-             model=self.model_name,
-             tokenizer=self.model_name,
-             device=0 if torch.cuda.is_available() else -1,
-             max_length=200,
-             min_length=150,
-             do_sample=True,
-             temperature=0.7
-         )

-         self.qa_pipeline = pipeline(
-             "text2text-generation",
-             model=self.model_name,
-             tokenizer=self.model_name,
-             device=0 if torch.cuda.is_available() else -1,
-             max_length=512,
-             do_sample=True,
-             temperature=0.7
-         )

          # Initialize BLIP if enabled
          if self.use_blip:
@@ -248,7 +438,12 @@ class HIPAACompliantThesisAnalyzer:
              self.use_ocr = False

      def _download_nltk_resources(self):
-         """Download required NLTK resources"""
          resources = [
              ('tokenizers/punkt', 'punkt'),
              ('tokenizers/punkt_tab', 'punkt_tab'),
@@ -262,7 +457,8 @@ class HIPAACompliantThesisAnalyzer:
                  nltk.data.find(resource_path)
              except LookupError:
                  try:
-                     nltk.download(resource_name, quiet=True)
                  except Exception as e:
                      print(f"Warning: Failed to download {resource_name}: {e}")
@@ -281,8 +477,8 @@ class HIPAACompliantThesisAnalyzer:
          """Calculate secure hash of document content"""
          return hashlib.sha256(content.encode()).hexdigest()

-     def process_document_securely(self, pdf_path, questions, output_file=None):
-         """Process document with full HIPAA compliance"""
          self.check_session_timeout()

          # Calculate document hash for audit trail
@@ -313,6 +509,17 @@ class HIPAACompliantThesisAnalyzer:
              ocr_text = " ".join([result['ocr_text'] for result in ocr_results if result.get('ocr_text')])
              combined_text = text + " " + ocr_text

              # Generate analysis
              sections = self._extract_key_sections(combined_text)
              key_terms = self._extract_key_terms(combined_text)
@@ -337,7 +544,7 @@ class HIPAACompliantThesisAnalyzer:
                  "document_info": {
                      "file_path": os.path.basename(pdf_path),  # Only filename for privacy
                      "analysis_timestamp": datetime.now().isoformat(),
-                     "total_characters": len(text),
                      "total_images": len(images),
                      "device_used": str(self.device)
                  },
@@ -354,8 +561,8 @@ class HIPAACompliantThesisAnalyzer:
                  },
                  "question_responses": question_answers,
                  "statistics": {
-                     "total_text_characters": len(text),
-                     "ocr_text_characters": len(ocr_text),
                      "questions_processed": len(questions),
                      "sections_identified": len(sections),
                      "key_terms_extracted": len(key_terms)
@@ -372,6 +579,102 @@ class HIPAACompliantThesisAnalyzer:
          except Exception as e:
              self.hipaa_logger.log_access(self.user_id, "PROCESSING_ERROR", pdf_path, success=False)
              raise e

      def _extract_text_and_images(self, pdf_path):
          """Securely extract text and images from PDF"""
@@ -534,13 +837,25 @@ class HIPAACompliantThesisAnalyzer:
          """Extract key terms securely"""
          try:
              words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
-             words = [
-                 self.lemmatizer.lemmatize(word)
-                 for word in words
-                 if word not in self.stop_words
-                 and len(word) > 3
-                 and word.isalpha()
-             ]

              word_freq = Counter(words)
              return [term for term, freq in word_freq.most_common(20)]
@@ -552,6 +867,12 @@ class HIPAACompliantThesisAnalyzer:
      def _generate_summary_secure(self, text):
          """Generate summary using local T5 model"""
          try:
              clean_text = re.sub(r'\s+', ' ', text).strip()

              # Chunk text for processing
@@ -581,6 +902,14 @@ class HIPAACompliantThesisAnalyzer:

          for question in questions:
              try:
                  prompt = f"question: {question} context: {text[:1000]}"

                  answer_result = self.qa_pipeline(
@@ -635,6 +964,77 @@ class AnalyzeReq(BaseModel):
      userId: str
      password: str
      useEncryption: bool = False

  @app.post('/analyze')
  def analyze(req: AnalyzeReq):
@@ -647,30 +1047,14 @@ def analyze(req: AnalyzeReq):
          analyzer = HIPAACompliantThesisAnalyzer(
              user_id=req.userId,
              password=req.password,
-             session_timeout=30
          )

          pdf_path = req.storageKey

-         # Sample questions
-         questions = [
-             "What is the main objective of the research?",
-             "What methodology was used in the study?",
-             "What are the key findings or results?",
-             "What conclusions did the authors draw?",
-             "What are the limitations of the study?",
-             "What motivated the researchers to conduct this study?",
-             "How does this research relate to existing literature?",
-             "What are the practical implications of the findings?",
-             "What assumptions underlie the research?",
-             "What statistical methods were used to analyze the data?",
-             "How robust are the study's findings?",
-             "Are there any potential biases in the study design or data collection?",
-             "How do the results compare with previous studies on the same topic?",
-             "What are the potential future applications of this research?",
-             "How could this research be expanded or built upon in future studies?",
-             "What new questions have emerged as a result of this study?"
-         ]

          # Process document securely
          print("\nProcessing document with HIPAA compliance...")
@@ -731,4 +1115,4 @@ COMPLIANCE NOTES:

  """)

- #main()
hipaathesis.py (updated version; added lines are marked with +):

  import PyPDF2
  import re
  from collections import Counter
+ import os
  import nltk
+
+ def setup_cache_directories():
+     """Setup cache directories for transformers and torch with proper permissions"""
+     try:
+         # Create cache directories in /app with proper permissions
+         cache_dirs = [
+             '/app/.cache/huggingface',
+             '/app/.cache/torch',
+             '/root/.cache/huggingface',
+             '/root/.cache/torch'
+         ]
+
+         for cache_dir in cache_dirs:
+             os.makedirs(cache_dir, exist_ok=True)
+             # Set permissions to be writable
+             os.chmod(cache_dir, 0o777)
+
+         # Set environment variables for cache directories
+         os.environ['HF_HOME'] = '/app/.cache/huggingface'
+         os.environ['TRANSFORMERS_CACHE'] = '/app/.cache/huggingface'
+         os.environ['TORCH_HOME'] = '/app/.cache/torch'
+
+         print(f"Cache directories setup complete: {cache_dirs}")
+
+     except Exception as e:
+         print(f"Warning: Cache directory setup failed: {e}")
+
+ # Set NLTK data path BEFORE any other NLTK imports
+ def setup_nltk_data():
+     """Setup NLTK data directory in container-writable location"""
+     try:
+         # Use the app directory for NLTK data in container
+         nltk_data_dir = '/app/nltk_data'
+
+         # Ensure directory exists and is writable
+         os.makedirs(nltk_data_dir, exist_ok=True)
+
+         # Set NLTK data path - this must be done first
+         nltk.data.path.clear()
+         nltk.data.path.append(nltk_data_dir)
+
+         # Also set the NLTK_DATA environment variable
+         os.environ['NLTK_DATA'] = nltk_data_dir
+
+         # Setup cache directories for transformers and torch
+         setup_cache_directories()
+
+         # Download required resources if not present
+         required_resources = [
+             'punkt',
+             'punkt_tab',
+             'stopwords',
+             'wordnet',
+             'omw-1.4'
+         ]
+
+         for resource in required_resources:
+             try:
+                 nltk.data.find(f'tokenizers/{resource}' if 'punkt' in resource else f'corpora/{resource}')
+             except LookupError:
+                 try:
+                     nltk.download(resource, download_dir=nltk_data_dir, quiet=True)
+                     print(f"Downloaded NLTK resource: {resource}")
+                 except Exception as e:
+                     print(f"Warning: Could not download {resource}: {e}")
+
+     except Exception as e:
+         print(f"Warning: NLTK setup failed: {e}")
+
+ # Call setup immediately after basic imports
+ setup_nltk_data()
+
+ # Now import NLTK modules after setup
  from nltk.tokenize import sent_tokenize, word_tokenize
  from nltk.corpus import stopwords
  from nltk.stem import WordNetLemmatizer
 
  from datetime import datetime, timedelta
  import json
  import torch
+ from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline, BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
  import warnings
  import fitz  # PyMuPDF
  from PIL import Image, ImageEnhance, ImageFilter
 
      OPENCV_AVAILABLE = False
  import numpy as np

+ from questions import THESIS_QUESTIONS
+ from pubtator_annotator import PubTatorAnnotator
+
  warnings.filterwarnings('ignore')

  app = FastAPI(title='AI (PDF→Summary+QnA+Scores)', version='0.2.1')
+ app.mount("/static", StaticFiles(directory="static"), name="static")

  class HIPAALogger:
      """HIPAA-compliant audit logging system"""

      def __init__(self, log_file="hipaa_audit.log"):
+         # Create logs directory if it doesn't exist
+         log_dir = '/app/logs'
+         os.makedirs(log_dir, exist_ok=True)
+
+         # Use the new log file path
+         self.log_file = os.path.join(log_dir, log_file)
+         self.logger = None
          self.setup_logging()

      def setup_logging(self):
+         """Setup secure audit logging with fallback to console"""
+         try:
+             # Try to create file handler
+             logging.basicConfig(
+                 filename=self.log_file,
+                 level=logging.INFO,
+                 format='%(asctime)s - %(levelname)s - %(message)s',
+                 datefmt='%Y-%m-%d %H:%M:%S'
+             )
+             self.logger = logging.getLogger('HIPAA_AUDIT')
+             print(f"HIPAA logging initialized: {self.log_file}")
+         except PermissionError:
+             # Fallback to console logging if file writing fails
+             logging.basicConfig(
+                 level=logging.INFO,
+                 format='%(asctime)s - %(levelname)s - %(message)s',
+                 datefmt='%Y-%m-%d %H:%M:%S'
+             )
+             self.logger = logging.getLogger('HIPAA_AUDIT')
+             print(f"Warning: Cannot write to {self.log_file}, using console logging")
+         except Exception as e:
+             # Fallback to console logging for any other error
+             logging.basicConfig(
+                 level=logging.INFO,
+                 format='%(asctime)s - %(levelname)s - %(message)s',
+                 datefmt='%Y-%m-%d %H:%M:%S'
+             )
+             self.logger = logging.getLogger('HIPAA_AUDIT')
+             print(f"Warning: Logging setup failed ({e}), using console logging")

      def log_access(self, user_id, action, resource, success=True):
          """Log access attempts and actions"""
 
      def secure_save(self, data, filepath):
          """Save data with encryption"""
+         try:
+             if self.fernet:
+                 encrypted_data = self.encrypt_data(json.dumps(data))
+                 with open(filepath + '.enc', 'wb') as f:
+                     f.write(encrypted_data)
+             else:
+                 with open(filepath, 'w', encoding='utf-8') as f:
+                     json.dump(data, f, indent=2)
+         except PermissionError:
+             print(f"Warning: Cannot write to {filepath}, saving to /tmp instead")
+             # Fallback to /tmp directory
+             import tempfile
+             temp_path = os.path.join(tempfile.gettempdir(), os.path.basename(filepath))
+             if self.fernet:
+                 encrypted_data = self.encrypt_data(json.dumps(data))
+                 with open(temp_path + '.enc', 'wb') as f:
+                     f.write(encrypted_data)
+             else:
+                 with open(temp_path, 'w', encoding='utf-8') as f:
+                     json.dump(data, f, indent=2)
+             print(f"Data saved to: {temp_path}")
+         except Exception as e:
+             print(f"Error saving data: {e}")
+             # Still try to save to /tmp as last resort
+             try:
+                 import tempfile
+                 temp_path = os.path.join(tempfile.gettempdir(), os.path.basename(filepath))
+                 if self.fernet:
+                     encrypted_data = self.encrypt_data(json.dumps(data))
+                     with open(temp_path + '.enc', 'wb') as f:
+                         f.write(encrypted_data)
+                 else:
+                     with open(temp_path, 'w', encoding='utf-8') as f:
+                         json.dump(data, f, indent=2)
+                 print(f"Data saved to fallback location: {temp_path}")
+             except Exception as fallback_error:
+                 print(f"Failed to save data even to fallback location: {fallback_error}")

      def secure_load(self, filepath):
          """Load encrypted data"""
 
  class HIPAACompliantThesisAnalyzer:
      """HIPAA-compliant version of the thesis analyzer"""

+     def __init__(self, user_id=None, password=None, session_timeout=30, model_name="t5-small"):
          self.user_id = user_id or getpass.getuser()
          self.session_timeout = session_timeout  # minutes
          self.session_start = datetime.now()
          self.last_activity = datetime.now()
+         self.model_name = model_name
+
+         # Map model names to their optimal tasks and parameters
+         self.model_configs = {
+             "t5-small": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "t5-base": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "t5-large": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "bart-large-cnn": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "facebook/bart-base": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "distilbart-cnn-12-6": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "sshleifer/distilbart-cnn-6-6": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "pegasus-large": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "flan-t5-base": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "flan-t5-large": {"task": "text2text-generation", "summarizer_task": "summarization"}
+         }

          # Initialize HIPAA compliance components
          self.hipaa_logger = HIPAALogger()
 
          except LookupError as e:
              print(f"NLTK resource error: {e}")
              self._download_nltk_resources()
+             try:
+                 self.lemmatizer = WordNetLemmatizer()
+                 self.stop_words = set(stopwords.words('english'))
+             except Exception as e2:
+                 print(f"Failed to initialize NLTK after download: {e2}")
+                 # Fallback to basic functionality
+                 self.lemmatizer = None
+                 self.stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
+         except Exception as e:
+             print(f"Error initializing NLTK: {e}")
+             # Fallback to basic functionality
+             self.lemmatizer = None
+             self.stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])

          self.thesis_text = ""
          self.sentences = []
 
          self.use_ocr = True
          self.use_blip = True

+         # Initialize Model
+         print(f"Loading {self.model_name} model (HIPAA-compliant local processing)...")
          self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

+         try:
+             # Try to load with explicit cache directory
+             cache_dir = '/app/.cache/huggingface'
+             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=cache_dir)
+             self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name, cache_dir=cache_dir)
+             self.model.to(self.device)
+             print(f"{self.model_name} loaded successfully from cache")
+         except Exception as e:
+             print(f"Error loading {self.model_name}: {e}")
+             print("Attempting to load with fallback cache directory...")
+             try:
+                 # Fallback to default cache
+                 self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+                 self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
+                 self.model.to(self.device)
+                 print(f"{self.model_name} loaded with fallback cache")
+             except Exception as e2:
+                 print(f"Failed to load {self.model_name}: {e2}")
+                 # Fallback to t5-small if requested model fails
+                 if self.model_name != "t5-small":
+                     print("Falling back to t5-small...")
+                     self.model_name = "t5-small"
+                     self.tokenizer = AutoTokenizer.from_pretrained("t5-small")
+                     self.model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
+                     self.model.to(self.device)
+                 else:
+                     raise e2

          # Initialize pipelines
+         try:
+             self.summarizer = pipeline(
+                 "summarization",
+                 model=self.model,
+                 tokenizer=self.tokenizer,
+                 device=0 if torch.cuda.is_available() else -1,
+                 max_length=200,
+                 min_length=50,
+                 do_sample=True,
+                 temperature=0.7
+             )

+             self.qa_pipeline = pipeline(
+                 "text2text-generation",
+                 model=self.model,
+                 tokenizer=self.tokenizer,
+                 device=0 if torch.cuda.is_available() else -1,
+                 max_length=512,
+                 do_sample=True,
+                 temperature=0.7
+             )
+             print("Pipelines initialized successfully")
+         except Exception as e:
+             print(f"Error initializing pipelines: {e}")
+             # Create fallback pipelines
+             self.summarizer = None
+             self.qa_pipeline = None

          # Initialize BLIP if enabled
          if self.use_blip:
 
              self.use_ocr = False

      def _download_nltk_resources(self):
+         """Download required NLTK resources to user directory"""
+         # Use the same user-writable directory
+         nltk_data_dir = os.path.join(os.path.expanduser('~'), 'nltk_data')
+         os.makedirs(nltk_data_dir, exist_ok=True)
+         nltk.data.path.append(nltk_data_dir)
+
          resources = [
              ('tokenizers/punkt', 'punkt'),
              ('tokenizers/punkt_tab', 'punkt_tab'),

                  nltk.data.find(resource_path)
              except LookupError:
                  try:
+                     nltk.download(resource_name, download_dir=nltk_data_dir, quiet=True)
+                     print(f"Downloaded NLTK resource: {resource_name}")
                  except Exception as e:
                      print(f"Warning: Failed to download {resource_name}: {e}")
 
 
          """Calculate secure hash of document content"""
          return hashlib.sha256(content.encode()).hexdigest()

+     def _prepare_document(self, pdf_path):
+         """Common method to prepare document for processing (extract text/images/OCR)"""
          self.check_session_timeout()

          # Calculate document hash for audit trail

              ocr_text = " ".join([result['ocr_text'] for result in ocr_results if result.get('ocr_text')])
              combined_text = text + " " + ocr_text

+             return combined_text, images, ocr_results, doc_hash
+
+         except Exception as e:
+             self.hipaa_logger.log_access(self.user_id, "PREPARATION_ERROR", pdf_path, success=False)
+             raise e
+
+     def process_document_securely(self, pdf_path, questions, output_file=None):
+         """Process document with full HIPAA compliance"""
+         combined_text, images, ocr_results, doc_hash = self._prepare_document(pdf_path)
+
+         try:
              # Generate analysis
              sections = self._extract_key_sections(combined_text)
              key_terms = self._extract_key_terms(combined_text)

                  "document_info": {
                      "file_path": os.path.basename(pdf_path),  # Only filename for privacy
                      "analysis_timestamp": datetime.now().isoformat(),
+                     "total_characters": len(combined_text),
                      "total_images": len(images),
                      "device_used": str(self.device)
                  },

                  },
                  "question_responses": question_answers,
                  "statistics": {
+                     "total_text_characters": len(combined_text),
+                     "ocr_text_characters": sum(len(r['ocr_text']) for r in ocr_results if r.get('ocr_text')),
                      "questions_processed": len(questions),
                      "sections_identified": len(sections),
                      "key_terms_extracted": len(key_terms)
 
          except Exception as e:
              self.hipaa_logger.log_access(self.user_id, "PROCESSING_ERROR", pdf_path, success=False)
              raise e
+
+     def process_summary_only(self, pdf_path, output_file=None):
+         """Process document for summary only"""
+         combined_text, images, ocr_results, doc_hash = self._prepare_document(pdf_path)
+
+         try:
+             # Generate summary
+             summary = self._generate_summary_secure(combined_text)
+             key_terms = self._extract_key_terms(combined_text)
+             sections = self._extract_key_sections(combined_text)
+
+             self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "SUMMARY_COMPLETE")
+
+             report = {
+                 "hipaa_compliance": {
+                     "processed_locally": True,
+                     "user_id": self.user_id,
+                     "document_hash": doc_hash,
+                     "processing_timestamp": datetime.now().isoformat()
+                 },
+                 "text_analysis": {
+                     "summary": summary,
+                     "key_terms": key_terms[:15],
+                     "sections_found": list(sections.keys())
+                 }
+             }
+
+             if output_file:
+                 self.secure_handler.secure_save(report, output_file)
+
+             return report
+         except Exception as e:
+             self.hipaa_logger.log_access(self.user_id, "SUMMARY_ERROR", pdf_path, success=False)
+             raise e
+
+     def process_questions_only(self, pdf_path, questions, output_file=None):
+         """Process document for Q&A only"""
+         combined_text, images, ocr_results, doc_hash = self._prepare_document(pdf_path)
+
+         try:
+             # Generate answers
+             question_answers = self._answer_questions_secure(questions, combined_text)
+
+             self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "QA_COMPLETE")
+
+             report = {
+                 "hipaa_compliance": {
+                     "processed_locally": True,
+                     "user_id": self.user_id,
+                     "document_hash": doc_hash,
+                     "processing_timestamp": datetime.now().isoformat()
+                 },
+                 "question_responses": question_answers
+             }
+
+             if output_file:
+                 self.secure_handler.secure_save(report, output_file)
+
+             return report
+         except Exception as e:
+             self.hipaa_logger.log_access(self.user_id, "QA_ERROR", pdf_path, success=False)
+             raise e
+
+     def process_annotations_only(self, pdf_path, output_file=None):
+         """Process document for PubTator annotations only"""
+         combined_text, images, ocr_results, doc_hash = self._prepare_document(pdf_path)
+
+         try:
+             # Initialize PubTator Annotator
+             # Note: PubTator legacy API might have issues, but we integrate as requested
+             # Using 'Gene' as a valid concept example, though API might still error
+             annotator = PubTatorAnnotator(bioconcept="Gene", output_format="JSON")
+
+             print("Submitting text to PubTator for annotation...")
+             annotations = annotator.annotate_text(combined_text)
+
+             self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "ANNOTATION_COMPLETE")
+
+             report = {
+                 "hipaa_compliance": {
+                     "processed_locally": False,  # PubTator is external
+                     "user_id": self.user_id,
+                     "document_hash": doc_hash,
+                     "processing_timestamp": datetime.now().isoformat(),
+                     "external_api_used": "PubTator Legacy"
+                 },
+                 "annotations": annotations if annotations is not None else "Failed to retrieve annotations"
+             }
+
+             if output_file:
+                 self.secure_handler.secure_save(report, output_file)
+
+             return report
+         except Exception as e:
+             self.hipaa_logger.log_access(self.user_id, "ANNOTATION_ERROR", pdf_path, success=False)
+             raise e

      def _extract_text_and_images(self, pdf_path):
          """Securely extract text and images from PDF"""
 
          """Extract key terms securely"""
          try:
              words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
+
+             # Handle case where lemmatizer might be None
+             if self.lemmatizer is not None:
+                 words = [
+                     self.lemmatizer.lemmatize(word)
+                     for word in words
+                     if word not in self.stop_words
+                     and len(word) > 3
+                     and word.isalpha()
+                 ]
+             else:
+                 # Fallback without lemmatization
+                 words = [
+                     word
+                     for word in words
+                     if word not in self.stop_words
+                     and len(word) > 3
+                     and word.isalpha()
+                 ]

              word_freq = Counter(words)
              return [term for term, freq in word_freq.most_common(20)]
 
      def _generate_summary_secure(self, text):
          """Generate summary using local T5 model"""
          try:
+             if self.summarizer is None:
+                 print("Summarizer not available, using fallback method")
+                 # Fallback to extractive summary
+                 sentences = re.split(r'[.!?]+', text)
+                 return " ".join(sentences[:3]) + "..."
+
              clean_text = re.sub(r'\s+', ' ', text).strip()

              # Chunk text for processing
 

          for question in questions:
              try:
+                 if self.qa_pipeline is None:
+                     answers[question] = {
+                         'answer': 'Q&A pipeline not available - using fallback',
+                         'method': 'Fallback',
+                         'processed_securely': True
+                     }
+                     continue
+
                  prompt = f"question: {question} context: {text[:1000]}"

                  answer_result = self.qa_pipeline(
 
      userId: str
      password: str
      useEncryption: bool = False
+     model_name: Optional[str] = "t5-small"
+
+ @app.post('/get_summary')
+ def get_summary(req: AnalyzeReq):
+     """Get summary only"""
+     try:
+         analyzer = HIPAACompliantThesisAnalyzer(
+             user_id=req.userId,
+             password=req.password,
+             session_timeout=30,
+             model_name=req.model_name
+         )
+
+         report = analyzer.process_summary_only(
+             pdf_path=req.storageKey,
+             output_file="hipaa_summary_only"
+         )
+
+         analyzer.cleanup_session()
+         return report
+     except Exception as e:
+         print(f"Error in get_summary: {e}")
+         return {"error": str(e)}
+
+ @app.post('/get_answer')
+ def get_answer(req: AnalyzeReq):
+     """Get answers only"""
+     try:
+         analyzer = HIPAACompliantThesisAnalyzer(
+             user_id=req.userId,
+             password=req.password,
+             session_timeout=30,
+             model_name=req.model_name
+         )
+
+         # Use questions from separate file
+         questions = THESIS_QUESTIONS
+
+         report = analyzer.process_questions_only(
+             pdf_path=req.storageKey,
+             questions=questions,
+             output_file="hipaa_answers_only"
+         )
+
+         analyzer.cleanup_session()
+         return report
+     except Exception as e:
+         print(f"Error in get_answer: {e}")
+         return {"error": str(e)}
+
+ @app.post('/get_annotations')
+ def get_annotations(req: AnalyzeReq):
+     """Get PubTator annotations only"""
+     try:
+         analyzer = HIPAACompliantThesisAnalyzer(
+             user_id=req.userId,
+             password=req.password,
+             session_timeout=30,
+             model_name=req.model_name
+         )
+
+         report = analyzer.process_annotations_only(
+             pdf_path=req.storageKey,
+             output_file="hipaa_annotations_only"
+         )
+
+         analyzer.cleanup_session()
+         return report
+     except Exception as e:
+         print(f"Error in get_annotations: {e}")
+         return {"error": str(e)}

  @app.post('/analyze')
  def analyze(req: AnalyzeReq):
 
          analyzer = HIPAACompliantThesisAnalyzer(
              user_id=req.userId,
              password=req.password,
+             session_timeout=30,
+             model_name=req.model_name
          )

          pdf_path = req.storageKey

+         # Use questions from separate file
+         questions = THESIS_QUESTIONS

          # Process document securely
          print("\nProcessing document with HIPAA compliance...")

  """)

+ #main()
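
Usage note: the commit adds three single-purpose endpoints (/get_summary, /get_answer, /get_annotations) alongside /analyze. Below is a minimal sketch of how a client might call them, assuming the app is served locally with uvicorn on port 8000; the host, port, and credential values are illustrative, not part of the commit:

    import requests

    payload = {
        "storageKey": "static/thesis.pdf",  # a PDF path the server can read
        "userId": "demo_user",              # illustrative credentials
        "password": "demo_password",
        "useEncryption": False,
        "model_name": "t5-small",
    }

    # Exercise each of the new single-purpose endpoints in turn
    for endpoint in ("get_summary", "get_answer", "get_annotations"):
        resp = requests.post(f"http://localhost:8000/{endpoint}", json=payload)
        print(endpoint, "->", resp.status_code)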
pubtator_annotator.py ADDED
@@ -0,0 +1,56 @@
+ import requests
+ import time
+ import json
+
+ class PubTatorAnnotator:
+     SUBMIT_URL = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/public/annotate/submit"
+     RECEIVE_URL = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/public/annotate/"
+
+     def __init__(self, bioconcept="Gene", output_format="JSON"):
+         # hipaathesis.py constructs this class with these keyword arguments;
+         # they are stored here, although the legacy submit call below does not use them yet
+         self.bioconcept = bioconcept
+         self.output_format = output_format
+
+     def annotate_text(self, text):
+         try:
+             response = requests.post(self.SUBMIT_URL, json={"text": text})
+             response.raise_for_status()
+             submit_result = response.json()
+             session_id = submit_result.get("session_id")
+
+             if not session_id:
+                 print("No session ID returned.")
+                 return None
+
+             print(f"Session ID: {session_id}. Waiting for processing...")
+             time.sleep(5)  # allow server time to annotate
+
+             return self._retrieve_annotations(session_id)
+
+         except requests.exceptions.RequestException as e:
+             print(f"Error submitting text: {e}")
+             return None
+
+     def _retrieve_annotations(self, session_id):
+         try:
+             result_url = f"{self.RECEIVE_URL}{session_id}"
+             response = requests.get(result_url)
+             response.raise_for_status()
+             result = response.json()
+             return result.get("annotations", [])
+
+         except Exception as e:
+             print(f"Error retrieving result: {e}")
+             return None
+
+
+ if __name__ == "__main__":
+     annotator = PubTatorAnnotator()
+     text = "The p53 tumor suppressor gene is frequently mutated in human cancers."
+     results = annotator.annotate_text(text)
+
+     if results is not None:
+         print(json.dumps(results, indent=2))
+     else:
+         print("No annotations found.")
questions.py ADDED
@@ -0,0 +1,20 @@
+ # Research analysis questions for thesis analyzer
+
+ THESIS_QUESTIONS = [
+     "What is the main objective of the research?",
+     "What methodology was used in the study?",
+     "What are the key findings or results?",
+     "What conclusions did the authors draw?",
+     "What are the limitations of the study?",
+     "What motivated the researchers to conduct this study?",
+     "How does this research relate to existing literature?",
+     "What are the practical implications of the findings?",
+     "What assumptions underlie the research?",
+     "What statistical methods were used to analyze the data?",
+     "How robust are the study's findings?",
+     "Are there any potential biases in the study design or data collection?",
+     "How do the results compare with previous studies on the same topic?",
+     "What are the potential future applications of this research?",
+     "How could this research be expanded or built upon in future studies?",
+     "What new questions have emerged as a result of this study?"
+ ]
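
Usage note: hipaathesis.py consumes this module via "from questions import THESIS_QUESTIONS" and passes the full list to the Q&A pipeline. A caller could also slice it for a quick smoke test; the subset size here is illustrative:

    from questions import THESIS_QUESTIONS

    # Run only the first three questions while iterating on the pipeline
    smoke_test_questions = THESIS_QUESTIONS[:3]
    for q in smoke_test_questions:
        print(q)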
static/thesis.pdf ADDED
The diff for this file is too large to render. See raw diff