glitz-dev committed on
Commit 3b3e0b9
1 Parent(s): cf1842c

Split the HIPAA analyzer file into separate questions and annotations modules (given by Nataraj)

Files changed (4)
  1. hipaathesis.py +465 -81
  2. pubtator_annotator.py +50 -0
  3. questions.py +20 -0
  4. static/thesis.pdf +0 -0
hipaathesis.py CHANGED
(Old version shown first, with removed lines marked "-". The updated file, with added lines marked "+", follows after these hunks.)

@@ -1,7 +1,81 @@
  import PyPDF2
  import re
  from collections import Counter
  import nltk
  from nltk.tokenize import sent_tokenize, word_tokenize
  from nltk.corpus import stopwords
  from nltk.stem import WordNetLemmatizer
@@ -9,7 +83,7 @@ import string
  from datetime import datetime, timedelta
  import json
  import torch
- from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline, BlipProcessor, BlipForConditionalGeneration
  import warnings
  import fitz  # PyMuPDF
  from PIL import Image, ImageEnhance, ImageFilter
@@ -39,27 +113,57 @@ except ImportError:
      OPENCV_AVAILABLE = False
  import numpy as np

  warnings.filterwarnings('ignore')

  app = FastAPI(title='AI (PDF→Summary+QnA+Scores)', version='0.2.1')
- app.mount("/test", StaticFiles(directory="test"), name="test")

  class HIPAALogger:
      """HIPAA-compliant audit logging system"""

      def __init__(self, log_file="hipaa_audit.log"):
-         self.log_file = log_file
          self.setup_logging()

      def setup_logging(self):
-         """Setup secure audit logging"""
-         logging.basicConfig(
-             filename=self.log_file,
-             level=logging.INFO,
-             format='%(asctime)s - %(levelname)s - %(message)s',
-             datefmt='%Y-%m-%d %H:%M:%S'
-         )
-         self.logger = logging.getLogger('HIPAA_AUDIT')

      def log_access(self, user_id, action, resource, success=True):
          """Log access attempts and actions"""
@@ -111,13 +215,43 @@ class SecureFileHandler:

      def secure_save(self, data, filepath):
          """Save data with encryption"""
-         if self.fernet:
-             encrypted_data = self.encrypt_data(json.dumps(data))
-             with open(filepath + '.enc', 'wb') as f:
-                 f.write(encrypted_data)
-         else:
-             with open(filepath, 'w', encoding='utf-8') as f:
-                 json.dump(data, f, indent=2)

      def secure_load(self, filepath):
          """Load encrypted data"""
@@ -156,11 +290,26 @@ class SecureFileHandler:
  class HIPAACompliantThesisAnalyzer:
      """HIPAA-compliant version of the thesis analyzer"""

-     def __init__(self, user_id=None, password=None, session_timeout=30):
          self.user_id = user_id or getpass.getuser()
          self.session_timeout = session_timeout  # minutes
          self.session_start = datetime.now()
          self.last_activity = datetime.now()

          # Initialize HIPAA compliance components
          self.hipaa_logger = HIPAALogger()
@@ -184,8 +333,19 @@ class HIPAACompliantThesisAnalyzer:
          except LookupError as e:
              print(f"NLTK resource error: {e}")
              self._download_nltk_resources()
-             self.lemmatizer = WordNetLemmatizer()
-             self.stop_words = set(stopwords.words('english'))

          self.thesis_text = ""
          self.sentences = []
@@ -196,36 +356,66 @@ class HIPAACompliantThesisAnalyzer:
          self.use_ocr = True
          self.use_blip = True

-         # Initialize T5 model
-         print("Loading T5-small model (HIPAA-compliant local processing)...")
          self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

-         self.model_name = "t5-small"
-         self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
-         self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)
-         self.model.to(self.device)

          # Initialize pipelines
-         self.summarizer = pipeline(
-             "summarization",
-             model=self.model_name,
-             tokenizer=self.model_name,
-             device=0 if torch.cuda.is_available() else -1,
-             max_length=200,
-             min_length=150,
-             do_sample=True,
-             temperature=0.7
-         )

-         self.qa_pipeline = pipeline(
-             "text2text-generation",
-             model=self.model_name,
-             tokenizer=self.model_name,
-             device=0 if torch.cuda.is_available() else -1,
-             max_length=512,
-             do_sample=True,
-             temperature=0.7
-         )

          # Initialize BLIP if enabled
          if self.use_blip:
@@ -248,7 +438,12 @@ class HIPAACompliantThesisAnalyzer:
              self.use_ocr = False

      def _download_nltk_resources(self):
-         """Download required NLTK resources"""
          resources = [
              ('tokenizers/punkt', 'punkt'),
              ('tokenizers/punkt_tab', 'punkt_tab'),
@@ -262,7 +457,8 @@ class HIPAACompliantThesisAnalyzer:
                  nltk.data.find(resource_path)
              except LookupError:
                  try:
-                     nltk.download(resource_name, quiet=True)
                  except Exception as e:
                      print(f"Warning: Failed to download {resource_name}: {e}")
@@ -281,8 +477,8 @@ class HIPAACompliantThesisAnalyzer:
          """Calculate secure hash of document content"""
          return hashlib.sha256(content.encode()).hexdigest()

-     def process_document_securely(self, pdf_path, questions, output_file=None):
-         """Process document with full HIPAA compliance"""
          self.check_session_timeout()

          # Calculate document hash for audit trail
@@ -313,6 +509,17 @@ class HIPAACompliantThesisAnalyzer:
              ocr_text = " ".join([result['ocr_text'] for result in ocr_results if result.get('ocr_text')])
              combined_text = text + " " + ocr_text

              # Generate analysis
              sections = self._extract_key_sections(combined_text)
              key_terms = self._extract_key_terms(combined_text)
@@ -337,7 +544,7 @@ class HIPAACompliantThesisAnalyzer:
                  "document_info": {
                      "file_path": os.path.basename(pdf_path),  # Only filename for privacy
                      "analysis_timestamp": datetime.now().isoformat(),
-                     "total_characters": len(text),
                      "total_images": len(images),
                      "device_used": str(self.device)
                  },
@@ -354,8 +561,8 @@ class HIPAACompliantThesisAnalyzer:
                  },
                  "question_responses": question_answers,
                  "statistics": {
-                     "total_text_characters": len(text),
-                     "ocr_text_characters": len(ocr_text),
                      "questions_processed": len(questions),
                      "sections_identified": len(sections),
                      "key_terms_extracted": len(key_terms)
@@ -372,6 +579,102 @@ class HIPAACompliantThesisAnalyzer:
          except Exception as e:
              self.hipaa_logger.log_access(self.user_id, "PROCESSING_ERROR", pdf_path, success=False)
              raise e

      def _extract_text_and_images(self, pdf_path):
          """Securely extract text and images from PDF"""
@@ -534,13 +837,25 @@ class HIPAACompliantThesisAnalyzer:
          """Extract key terms securely"""
          try:
              words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
-             words = [
-                 self.lemmatizer.lemmatize(word)
-                 for word in words
-                 if word not in self.stop_words
-                 and len(word) > 3
-                 and word.isalpha()
-             ]

              word_freq = Counter(words)
              return [term for term, freq in word_freq.most_common(20)]
@@ -552,6 +867,12 @@ class HIPAACompliantThesisAnalyzer:
      def _generate_summary_secure(self, text):
          """Generate summary using local T5 model"""
          try:
              clean_text = re.sub(r'\s+', ' ', text).strip()

              # Chunk text for processing
@@ -581,6 +902,14 @@ class HIPAACompliantThesisAnalyzer:

          for question in questions:
              try:
                  prompt = f"question: {question} context: {text[:1000]}"

                  answer_result = self.qa_pipeline(
@@ -635,6 +964,77 @@ class AnalyzeReq(BaseModel):
      userId: str
      password: str
      useEncryption: bool = False

  @app.post('/analyze')
  def analyze(req: AnalyzeReq):
@@ -647,30 +1047,14 @@ def analyze(req: AnalyzeReq):
          analyzer = HIPAACompliantThesisAnalyzer(
              user_id=req.userId,
              password=req.password,
-             session_timeout=30
          )

          pdf_path = req.storageKey

-         # Sample questions
-         questions = [
-             "What is the main objective of the research?",
-             "What methodology was used in the study?",
-             "What are the key findings or results?",
-             "What conclusions did the authors draw?",
-             "What are the limitations of the study?",
-             "What motivated the researchers to conduct this study?",
-             "How does this research relate to existing literature?",
-             "What are the practical implications of the findings?",
-             "What assumptions underlie the research?",
-             "What statistical methods were used to analyze the data?",
-             "How robust are the study's findings?",
-             "Are there any potential biases in the study design or data collection?",
-             "How do the results compare with previous studies on the same topic?",
-             "What are the potential future applications of this research?",
-             "How could this research be expanded or built upon in future studies?",
-             "What new questions have emerged as a result of this study?"
-         ]

          # Process document securely
          print("\nProcessing document with HIPAA compliance...")
@@ -731,4 +1115,4 @@ COMPLIANCE NOTES:

  """)

- #main()
hipaathesis.py (updated version; added lines are marked with +):

  import PyPDF2
  import re
  from collections import Counter
+ import os
  import nltk
+
+ def setup_cache_directories():
+     """Setup cache directories for transformers and torch with proper permissions"""
+     try:
+         # Create cache directories in /app with proper permissions
+         cache_dirs = [
+             '/app/.cache/huggingface',
+             '/app/.cache/torch',
+             '/root/.cache/huggingface',
+             '/root/.cache/torch'
+         ]
+
+         for cache_dir in cache_dirs:
+             os.makedirs(cache_dir, exist_ok=True)
+             # Set permissions to be writable
+             os.chmod(cache_dir, 0o777)
+
+         # Set environment variables for cache directories
+         os.environ['HF_HOME'] = '/app/.cache/huggingface'
+         os.environ['TRANSFORMERS_CACHE'] = '/app/.cache/huggingface'
+         os.environ['TORCH_HOME'] = '/app/.cache/torch'
+
+         print(f"Cache directories setup complete: {cache_dirs}")
+
+     except Exception as e:
+         print(f"Warning: Cache directory setup failed: {e}")
+
+ # Set NLTK data path BEFORE any other NLTK imports
+ def setup_nltk_data():
+     """Setup NLTK data directory in container-writable location"""
+     try:
+         # Use the app directory for NLTK data in container
+         nltk_data_dir = '/app/nltk_data'
+
+         # Ensure directory exists and is writable
+         os.makedirs(nltk_data_dir, exist_ok=True)
+
+         # Set NLTK data path - this must be done first
+         nltk.data.path.clear()
+         nltk.data.path.append(nltk_data_dir)
+
+         # Also set the NLTK_DATA environment variable
+         os.environ['NLTK_DATA'] = nltk_data_dir
+
+         # Setup cache directories for transformers and torch
+         setup_cache_directories()
+
+         # Download required resources if not present
+         required_resources = [
+             'punkt',
+             'punkt_tab',
+             'stopwords',
+             'wordnet',
+             'omw-1.4'
+         ]
+
+         for resource in required_resources:
+             try:
+                 nltk.data.find(f'tokenizers/{resource}' if 'punkt' in resource else f'corpora/{resource}')
+             except LookupError:
+                 try:
+                     nltk.download(resource, download_dir=nltk_data_dir, quiet=True)
+                     print(f"Downloaded NLTK resource: {resource}")
+                 except Exception as e:
+                     print(f"Warning: Could not download {resource}: {e}")
+
+     except Exception as e:
+         print(f"Warning: NLTK setup failed: {e}")
+
+ # Call setup immediately after basic imports
+ setup_nltk_data()
+
+ # Now import NLTK modules after setup
  from nltk.tokenize import sent_tokenize, word_tokenize
  from nltk.corpus import stopwords
  from nltk.stem import WordNetLemmatizer
 
  from datetime import datetime, timedelta
  import json
  import torch
+ from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline, BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
  import warnings
  import fitz  # PyMuPDF
  from PIL import Image, ImageEnhance, ImageFilter
 
      OPENCV_AVAILABLE = False
  import numpy as np

+ from questions import THESIS_QUESTIONS
+ from pubtator_annotator import PubTatorAnnotator
+
  warnings.filterwarnings('ignore')

  app = FastAPI(title='AI (PDF→Summary+QnA+Scores)', version='0.2.1')
+ app.mount("/static", StaticFiles(directory="static"), name="static")

  class HIPAALogger:
      """HIPAA-compliant audit logging system"""

      def __init__(self, log_file="hipaa_audit.log"):
+         # Create logs directory if it doesn't exist
+         log_dir = '/app/logs'
+         os.makedirs(log_dir, exist_ok=True)
+
+         # Use the new log file path
+         self.log_file = os.path.join(log_dir, log_file)
+         self.logger = None
          self.setup_logging()

      def setup_logging(self):
+         """Setup secure audit logging with fallback to console"""
+         try:
+             # Try to create file handler
+             logging.basicConfig(
+                 filename=self.log_file,
+                 level=logging.INFO,
+                 format='%(asctime)s - %(levelname)s - %(message)s',
+                 datefmt='%Y-%m-%d %H:%M:%S'
+             )
+             self.logger = logging.getLogger('HIPAA_AUDIT')
+             print(f"HIPAA logging initialized: {self.log_file}")
+         except PermissionError:
+             # Fallback to console logging if file writing fails
+             logging.basicConfig(
+                 level=logging.INFO,
+                 format='%(asctime)s - %(levelname)s - %(message)s',
+                 datefmt='%Y-%m-%d %H:%M:%S'
+             )
+             self.logger = logging.getLogger('HIPAA_AUDIT')
+             print(f"Warning: Cannot write to {self.log_file}, using console logging")
+         except Exception as e:
+             # Fallback to console logging for any other error
+             logging.basicConfig(
+                 level=logging.INFO,
+                 format='%(asctime)s - %(levelname)s - %(message)s',
+                 datefmt='%Y-%m-%d %H:%M:%S'
+             )
+             self.logger = logging.getLogger('HIPAA_AUDIT')
+             print(f"Warning: Logging setup failed ({e}), using console logging")

      def log_access(self, user_id, action, resource, success=True):
          """Log access attempts and actions"""
 
      def secure_save(self, data, filepath):
          """Save data with encryption"""
+         try:
+             if self.fernet:
+                 encrypted_data = self.encrypt_data(json.dumps(data))
+                 with open(filepath + '.enc', 'wb') as f:
+                     f.write(encrypted_data)
+             else:
+                 with open(filepath, 'w', encoding='utf-8') as f:
+                     json.dump(data, f, indent=2)
+         except PermissionError:
+             print(f"Warning: Cannot write to {filepath}, saving to /tmp instead")
+             # Fallback to /tmp directory
+             import tempfile
+             temp_path = os.path.join(tempfile.gettempdir(), os.path.basename(filepath))
+             if self.fernet:
+                 encrypted_data = self.encrypt_data(json.dumps(data))
+                 with open(temp_path + '.enc', 'wb') as f:
+                     f.write(encrypted_data)
+             else:
+                 with open(temp_path, 'w', encoding='utf-8') as f:
+                     json.dump(data, f, indent=2)
+             print(f"Data saved to: {temp_path}")
+         except Exception as e:
+             print(f"Error saving data: {e}")
+             # Still try to save to /tmp as last resort
+             try:
+                 import tempfile
+                 temp_path = os.path.join(tempfile.gettempdir(), os.path.basename(filepath))
+                 if self.fernet:
+                     encrypted_data = self.encrypt_data(json.dumps(data))
+                     with open(temp_path + '.enc', 'wb') as f:
+                         f.write(encrypted_data)
+                 else:
+                     with open(temp_path, 'w', encoding='utf-8') as f:
+                         json.dump(data, f, indent=2)
+                 print(f"Data saved to fallback location: {temp_path}")
+             except Exception as fallback_error:
+                 print(f"Failed to save data even to fallback location: {fallback_error}")

      def secure_load(self, filepath):
          """Load encrypted data"""
 
  class HIPAACompliantThesisAnalyzer:
      """HIPAA-compliant version of the thesis analyzer"""

+     def __init__(self, user_id=None, password=None, session_timeout=30, model_name="t5-small"):
          self.user_id = user_id or getpass.getuser()
          self.session_timeout = session_timeout  # minutes
          self.session_start = datetime.now()
          self.last_activity = datetime.now()
+         self.model_name = model_name
+
+         # Map model names to their optimal tasks and parameters
+         self.model_configs = {
+             "t5-small": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "t5-base": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "t5-large": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "bart-large-cnn": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "facebook/bart-base": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "distilbart-cnn-12-6": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "sshleifer/distilbart-cnn-6-6": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "pegasus-large": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "flan-t5-base": {"task": "text2text-generation", "summarizer_task": "summarization"},
+             "flan-t5-large": {"task": "text2text-generation", "summarizer_task": "summarization"}
+         }

          # Initialize HIPAA compliance components
          self.hipaa_logger = HIPAALogger()
 
          except LookupError as e:
              print(f"NLTK resource error: {e}")
              self._download_nltk_resources()
+             try:
+                 self.lemmatizer = WordNetLemmatizer()
+                 self.stop_words = set(stopwords.words('english'))
+             except Exception as e2:
+                 print(f"Failed to initialize NLTK after download: {e2}")
+                 # Fallback to basic functionality
+                 self.lemmatizer = None
+                 self.stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
+         except Exception as e:
+             print(f"Error initializing NLTK: {e}")
+             # Fallback to basic functionality
+             self.lemmatizer = None
+             self.stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])

          self.thesis_text = ""
          self.sentences = []
 
          self.use_ocr = True
          self.use_blip = True

+         # Initialize Model
+         print(f"Loading {self.model_name} model (HIPAA-compliant local processing)...")
          self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

+         try:
+             # Try to load with explicit cache directory
+             cache_dir = '/app/.cache/huggingface'
+             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=cache_dir)
+             self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name, cache_dir=cache_dir)
+             self.model.to(self.device)
+             print(f"{self.model_name} loaded successfully from cache")
+         except Exception as e:
+             print(f"Error loading {self.model_name}: {e}")
+             print("Attempting to load with fallback cache directory...")
+             try:
+                 # Fallback to default cache
+                 self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+                 self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
+                 self.model.to(self.device)
+                 print(f"{self.model_name} loaded with fallback cache")
+             except Exception as e2:
+                 print(f"Failed to load {self.model_name}: {e2}")
+                 # Fallback to t5-small if requested model fails
+                 if self.model_name != "t5-small":
+                     print("Falling back to t5-small...")
+                     self.model_name = "t5-small"
+                     self.tokenizer = AutoTokenizer.from_pretrained("t5-small")
+                     self.model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
+                     self.model.to(self.device)
+                 else:
+                     raise e2

          # Initialize pipelines
+         try:
+             self.summarizer = pipeline(
+                 "summarization",
+                 model=self.model,
+                 tokenizer=self.tokenizer,
+                 device=0 if torch.cuda.is_available() else -1,
+                 max_length=200,
+                 min_length=50,
+                 do_sample=True,
+                 temperature=0.7
+             )

+             self.qa_pipeline = pipeline(
+                 "text2text-generation",
+                 model=self.model,
+                 tokenizer=self.tokenizer,
+                 device=0 if torch.cuda.is_available() else -1,
+                 max_length=512,
+                 do_sample=True,
+                 temperature=0.7
+             )
+             print("Pipelines initialized successfully")
+         except Exception as e:
+             print(f"Error initializing pipelines: {e}")
+             # Create fallback pipelines
+             self.summarizer = None
+             self.qa_pipeline = None

          # Initialize BLIP if enabled
          if self.use_blip:
 
              self.use_ocr = False

      def _download_nltk_resources(self):
+         """Download required NLTK resources to user directory"""
+         # Use the same user-writable directory
+         nltk_data_dir = os.path.join(os.path.expanduser('~'), 'nltk_data')
+         os.makedirs(nltk_data_dir, exist_ok=True)
+         nltk.data.path.append(nltk_data_dir)
+
          resources = [
              ('tokenizers/punkt', 'punkt'),
              ('tokenizers/punkt_tab', 'punkt_tab'),

                  nltk.data.find(resource_path)
              except LookupError:
                  try:
+                     nltk.download(resource_name, download_dir=nltk_data_dir, quiet=True)
+                     print(f"Downloaded NLTK resource: {resource_name}")
                  except Exception as e:
                      print(f"Warning: Failed to download {resource_name}: {e}")
 
 
          """Calculate secure hash of document content"""
          return hashlib.sha256(content.encode()).hexdigest()

+     def _prepare_document(self, pdf_path):
+         """Common method to prepare document for processing (extract text/images/OCR)"""
          self.check_session_timeout()

          # Calculate document hash for audit trail

              ocr_text = " ".join([result['ocr_text'] for result in ocr_results if result.get('ocr_text')])
              combined_text = text + " " + ocr_text

+             return combined_text, images, ocr_results, doc_hash
+
+         except Exception as e:
+             self.hipaa_logger.log_access(self.user_id, "PREPARATION_ERROR", pdf_path, success=False)
+             raise e
+
+     def process_document_securely(self, pdf_path, questions, output_file=None):
+         """Process document with full HIPAA compliance"""
+         combined_text, images, ocr_results, doc_hash = self._prepare_document(pdf_path)
+
+         try:
              # Generate analysis
              sections = self._extract_key_sections(combined_text)
              key_terms = self._extract_key_terms(combined_text)

                  "document_info": {
                      "file_path": os.path.basename(pdf_path),  # Only filename for privacy
                      "analysis_timestamp": datetime.now().isoformat(),
+                     "total_characters": len(combined_text),
                      "total_images": len(images),
                      "device_used": str(self.device)
                  },

                  },
                  "question_responses": question_answers,
                  "statistics": {
+                     "total_text_characters": len(combined_text),
+                     "ocr_text_characters": sum(len(r['ocr_text']) for r in ocr_results if r.get('ocr_text')),
                      "questions_processed": len(questions),
                      "sections_identified": len(sections),
                      "key_terms_extracted": len(key_terms)
 
          except Exception as e:
              self.hipaa_logger.log_access(self.user_id, "PROCESSING_ERROR", pdf_path, success=False)
              raise e
+
+     def process_summary_only(self, pdf_path, output_file=None):
+         """Process document for summary only"""
+         combined_text, images, ocr_results, doc_hash = self._prepare_document(pdf_path)
+
+         try:
+             # Generate summary
+             summary = self._generate_summary_secure(combined_text)
+             key_terms = self._extract_key_terms(combined_text)
+             sections = self._extract_key_sections(combined_text)
+
+             self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "SUMMARY_COMPLETE")
+
+             report = {
+                 "hipaa_compliance": {
+                     "processed_locally": True,
+                     "user_id": self.user_id,
+                     "document_hash": doc_hash,
+                     "processing_timestamp": datetime.now().isoformat()
+                 },
+                 "text_analysis": {
+                     "summary": summary,
+                     "key_terms": key_terms[:15],
+                     "sections_found": list(sections.keys())
+                 }
+             }
+
+             if output_file:
+                 self.secure_handler.secure_save(report, output_file)
+
+             return report
+         except Exception as e:
+             self.hipaa_logger.log_access(self.user_id, "SUMMARY_ERROR", pdf_path, success=False)
+             raise e
+
+     def process_questions_only(self, pdf_path, questions, output_file=None):
+         """Process document for Q&A only"""
+         combined_text, images, ocr_results, doc_hash = self._prepare_document(pdf_path)
+
+         try:
+             # Generate answers
+             question_answers = self._answer_questions_secure(questions, combined_text)
+
+             self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "QA_COMPLETE")
+
+             report = {
+                 "hipaa_compliance": {
+                     "processed_locally": True,
+                     "user_id": self.user_id,
+                     "document_hash": doc_hash,
+                     "processing_timestamp": datetime.now().isoformat()
+                 },
+                 "question_responses": question_answers
+             }
+
+             if output_file:
+                 self.secure_handler.secure_save(report, output_file)
+
+             return report
+         except Exception as e:
+             self.hipaa_logger.log_access(self.user_id, "QA_ERROR", pdf_path, success=False)
+             raise e
+
+     def process_annotations_only(self, pdf_path, output_file=None):
+         """Process document for PubTator annotations only"""
+         combined_text, images, ocr_results, doc_hash = self._prepare_document(pdf_path)
+
+         try:
+             # Initialize PubTator Annotator
+             # Note: PubTator legacy API might have issues, but we integrate as requested
+             # Using 'Gene' as a valid concept example, though API might still error
+             annotator = PubTatorAnnotator(bioconcept="Gene", output_format="JSON")
+
+             print("Submitting text to PubTator for annotation...")
+             annotations = annotator.annotate_text(combined_text)
+
+             self.hipaa_logger.log_phi_processing(self.user_id, doc_hash, "ANNOTATION_COMPLETE")
+
+             report = {
+                 "hipaa_compliance": {
+                     "processed_locally": False,  # PubTator is external
+                     "user_id": self.user_id,
+                     "document_hash": doc_hash,
+                     "processing_timestamp": datetime.now().isoformat(),
+                     "external_api_used": "PubTator Legacy"
+                 },
+                 "annotations": annotations if annotations is not None else "Failed to retrieve annotations"
+             }
+
+             if output_file:
+                 self.secure_handler.secure_save(report, output_file)
+
+             return report
+         except Exception as e:
+             self.hipaa_logger.log_access(self.user_id, "ANNOTATION_ERROR", pdf_path, success=False)
+             raise e

      def _extract_text_and_images(self, pdf_path):
          """Securely extract text and images from PDF"""
 
          """Extract key terms securely"""
          try:
              words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
+
+             # Handle case where lemmatizer might be None
+             if self.lemmatizer is not None:
+                 words = [
+                     self.lemmatizer.lemmatize(word)
+                     for word in words
+                     if word not in self.stop_words
+                     and len(word) > 3
+                     and word.isalpha()
+                 ]
+             else:
+                 # Fallback without lemmatization
+                 words = [
+                     word
+                     for word in words
+                     if word not in self.stop_words
+                     and len(word) > 3
+                     and word.isalpha()
+                 ]

              word_freq = Counter(words)
              return [term for term, freq in word_freq.most_common(20)]
 
      def _generate_summary_secure(self, text):
          """Generate summary using local T5 model"""
          try:
+             if self.summarizer is None:
+                 print("Summarizer not available, using fallback method")
+                 # Fallback to extractive summary
+                 sentences = re.split(r'[.!?]+', text)
+                 return " ".join(sentences[:3]) + "..."
+
              clean_text = re.sub(r'\s+', ' ', text).strip()

              # Chunk text for processing
 

          for question in questions:
              try:
+                 if self.qa_pipeline is None:
+                     answers[question] = {
+                         'answer': 'Q&A pipeline not available - using fallback',
+                         'method': 'Fallback',
+                         'processed_securely': True
+                     }
+                     continue
+
                  prompt = f"question: {question} context: {text[:1000]}"

                  answer_result = self.qa_pipeline(
 
      userId: str
      password: str
      useEncryption: bool = False
+     model_name: Optional[str] = "t5-small"
+
+ @app.post('/get_summary')
+ def get_summary(req: AnalyzeReq):
+     """Get summary only"""
+     try:
+         analyzer = HIPAACompliantThesisAnalyzer(
+             user_id=req.userId,
+             password=req.password,
+             session_timeout=30,
+             model_name=req.model_name
+         )
+
+         report = analyzer.process_summary_only(
+             pdf_path=req.storageKey,
+             output_file="hipaa_summary_only"
+         )
+
+         analyzer.cleanup_session()
+         return report
+     except Exception as e:
+         print(f"Error in get_summary: {e}")
+         return {"error": str(e)}
+
+ @app.post('/get_answer')
+ def get_answer(req: AnalyzeReq):
+     """Get answers only"""
+     try:
+         analyzer = HIPAACompliantThesisAnalyzer(
+             user_id=req.userId,
+             password=req.password,
+             session_timeout=30,
+             model_name=req.model_name
+         )
+
+         # Use questions from separate file
+         questions = THESIS_QUESTIONS
+
+         report = analyzer.process_questions_only(
+             pdf_path=req.storageKey,
+             questions=questions,
+             output_file="hipaa_answers_only"
+         )
+
+         analyzer.cleanup_session()
+         return report
+     except Exception as e:
+         print(f"Error in get_answer: {e}")
+         return {"error": str(e)}
+
+ @app.post('/get_annotations')
+ def get_annotations(req: AnalyzeReq):
+     """Get PubTator annotations only"""
+     try:
+         analyzer = HIPAACompliantThesisAnalyzer(
+             user_id=req.userId,
+             password=req.password,
+             session_timeout=30,
+             model_name=req.model_name
+         )
+
+         report = analyzer.process_annotations_only(
+             pdf_path=req.storageKey,
+             output_file="hipaa_annotations_only"
+         )
+
+         analyzer.cleanup_session()
+         return report
+     except Exception as e:
+         print(f"Error in get_annotations: {e}")
+         return {"error": str(e)}

  @app.post('/analyze')
  def analyze(req: AnalyzeReq):
 
          analyzer = HIPAACompliantThesisAnalyzer(
              user_id=req.userId,
              password=req.password,
+             session_timeout=30,
+             model_name=req.model_name
          )

          pdf_path = req.storageKey

+         # Use questions from separate file
+         questions = THESIS_QUESTIONS

          # Process document securely
          print("\nProcessing document with HIPAA compliance...")

  """)

+ #main()
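
Usage note: the commit adds three single-purpose endpoints (/get_summary, /get_answer, /get_annotations) alongside /analyze. Below is a minimal sketch of how a client might call them, assuming the app is served locally with uvicorn on port 8000; the host, port, and credential values are illustrative, not part of the commit:

    import requests

    payload = {
        "storageKey": "static/thesis.pdf",  # a PDF path the server can read
        "userId": "demo_user",              # illustrative credentials
        "password": "demo_password",
        "useEncryption": False,
        "model_name": "t5-small",
    }

    # Exercise each of the new single-purpose endpoints in turn
    for endpoint in ("get_summary", "get_answer", "get_annotations"):
        resp = requests.post(f"http://localhost:8000/{endpoint}", json=payload)
        print(endpoint, "->", resp.status_code)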
pubtator_annotator.py ADDED
@@ -0,0 +1,56 @@
+ import requests
+ import time
+ import json
+
+ class PubTatorAnnotator:
+     SUBMIT_URL = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/public/annotate/submit"
+     RECEIVE_URL = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/public/annotate/"
+
+     def __init__(self, bioconcept="Gene", output_format="JSON"):
+         # hipaathesis.py constructs this class with these keyword arguments;
+         # they are stored here, although the legacy submit call below does not use them yet
+         self.bioconcept = bioconcept
+         self.output_format = output_format
+
+     def annotate_text(self, text):
+         try:
+             response = requests.post(self.SUBMIT_URL, json={"text": text})
+             response.raise_for_status()
+             submit_result = response.json()
+             session_id = submit_result.get("session_id")
+
+             if not session_id:
+                 print("No session ID returned.")
+                 return None
+
+             print(f"Session ID: {session_id}. Waiting for processing...")
+             time.sleep(5)  # allow server time to annotate
+
+             return self._retrieve_annotations(session_id)
+
+         except requests.exceptions.RequestException as e:
+             print(f"Error submitting text: {e}")
+             return None
+
+     def _retrieve_annotations(self, session_id):
+         try:
+             result_url = f"{self.RECEIVE_URL}{session_id}"
+             response = requests.get(result_url)
+             response.raise_for_status()
+             result = response.json()
+             return result.get("annotations", [])
+
+         except Exception as e:
+             print(f"Error retrieving result: {e}")
+             return None
+
+
+ if __name__ == "__main__":
+     annotator = PubTatorAnnotator()
+     text = "The p53 tumor suppressor gene is frequently mutated in human cancers."
+     results = annotator.annotate_text(text)
+
+     if results is not None:
+         print(json.dumps(results, indent=2))
+     else:
+         print("No annotations found.")
questions.py ADDED
@@ -0,0 +1,20 @@
+ # Research analysis questions for thesis analyzer
+
+ THESIS_QUESTIONS = [
+     "What is the main objective of the research?",
+     "What methodology was used in the study?",
+     "What are the key findings or results?",
+     "What conclusions did the authors draw?",
+     "What are the limitations of the study?",
+     "What motivated the researchers to conduct this study?",
+     "How does this research relate to existing literature?",
+     "What are the practical implications of the findings?",
+     "What assumptions underlie the research?",
+     "What statistical methods were used to analyze the data?",
+     "How robust are the study's findings?",
+     "Are there any potential biases in the study design or data collection?",
+     "How do the results compare with previous studies on the same topic?",
+     "What are the potential future applications of this research?",
+     "How could this research be expanded or built upon in future studies?",
+     "What new questions have emerged as a result of this study?"
+ ]
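
Usage note: hipaathesis.py consumes this module via "from questions import THESIS_QUESTIONS" and passes the full list to the Q&A pipeline. A caller could also slice it for a quick smoke test; the subset size here is illustrative:

    from questions import THESIS_QUESTIONS

    # Run only the first three questions while iterating on the pipeline
    smoke_test_questions = THESIS_QUESTIONS[:3]
    for q in smoke_test_questions:
        print(q)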
static/thesis.pdf ADDED
The diff for this file is too large to render. See raw diff