Spaces:

LocaleNLP
/

eng_wol

Sleeping

App Files Files Community

Mgolo committed on Aug 12

Commit

79d1580

verified ·

1 Parent(s): fec6a5a

Upload 3 files

Browse files

Files changed (3) hide show

app.py +169 -0
localenpl5.jpeg +0 -0
requirements.txt +12 -0

app.py ADDED Viewed

	@@ -0,0 +1,169 @@

+# NOTE(review): `os` is imported three times in this module (here, on the
+# next line, and again further down); the repeats are redundant.
+import os
+import os
+# Redirect Streamlit config/cache to a writable temp folder.
+# This is set before `import streamlit` below -- presumably so the library
+# sees the value when it is imported; keep this ordering.
+os.environ["STREAMLIT_HOME"] = "/tmp/.streamlit"
+import streamlit as st
+from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
+import torch
+# NOTE(review): `unicodedata` is not referenced in the visible code.
+import unicodedata
+import re
+import whisper
+import tempfile
+import os
+import nltk
+# Fetch the NLTK 'punkt' resource at import time.
+# NOTE(review): `sent_tokenize` is imported below but not called in the
+# visible code.
+nltk.download('punkt')
+from nltk.tokenize import sent_tokenize
+import fitz  # PyMuPDF for PDF
+import docx
+from bs4 import BeautifulSoup
+import markdown2
+import chardet
+# --- Device selection ---
+# Use CUDA when torch reports it is available, otherwise fall back to CPU.
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# --- Load Wolof MarianMT model from HF hub ---
+@st.cache_resource
+def load_wolof_model():
+    model_name = "LocaleNLP/eng_wolof"
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    translator = pipeline("translation", model=model, tokenizer=tokenizer, device=0 if device.type == 'cuda' else -1)
+    return translator
+@st.cache_resource
+def load_whisper_model():
+    return whisper.load_model("base")
+def transcribe_audio(audio_path):
+    whisper_model = load_whisper_model()
+    return whisper_model.transcribe(audio_path)["text"]
+def translate(text):
+    translator = load_wolof_model()
+    lang_tag = ">>wol<<"
+    paragraphs = text.split("\n")
+    translated_output = []
+    with torch.no_grad():
+        for para in paragraphs:
+            if not para.strip():
+                translated_output.append("")
+                continue
+            sentences = [s.strip() for s in para.split('. ') if s.strip()]
+            formatted = [f"{lang_tag} {s}" for s in sentences]
+            results = translator(formatted,
+                                 max_length=5000,
+                                 num_beams=5,
+                                 early_stopping=True,
+                                 no_repeat_ngram_size=3,
+                                 repetition_penalty=1.5,
+                                 length_penalty=1.2)
+            translated_sentences = [r['translation_text'].capitalize() for r in results]
+            translated_output.append('. '.join(translated_sentences))
+    return "\n".join(translated_output)
+def extract_text_from_file(uploaded_file):
+    file_type = uploaded_file.name.split('.')[-1].lower()
+    content = uploaded_file.read()
+    if file_type == "pdf":
+        with fitz.open(stream=content, filetype="pdf") as doc:
+            return "\n".join([page.get_text() for page in doc])
+    elif file_type == "docx":
+        doc = docx.Document(uploaded_file)
+        return "\n".join([para.text for para in doc.paragraphs])
+    else:
+        encoding = chardet.detect(content)['encoding']
+        if encoding:
+            content = content.decode(encoding, errors='ignore')
+        if file_type in ("html", "htm"):
+            soup = BeautifulSoup(content, "html.parser")
+            return soup.get_text()
+        elif file_type == "md":
+            html = markdown2.markdown(content)
+            soup = BeautifulSoup(html, "html.parser")
+            return soup.get_text()
+        elif file_type == "srt":
+            return re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", content)
+        elif file_type in ("txt", "text"):
+            return content
+        else:
+            raise ValueError("Unsupported file type")
+def main():
+    st.set_page_config(page_title="LocaleNLP English-to-Wolof Translator", layout="wide", initial_sidebar_state="expanded")
+    with st.sidebar:
+        st.image("localenpl5.jpeg", use_container_width=True)
+        st.markdown("""
+        <h3 style='text-align: left; color: #4B8BBE;'>🌐 Wolof Translation</h3>
+        This app translates English text to Wolof (Senegal) using a custom MarianMT model hosted on Hugging Face.
+        """, unsafe_allow_html=True)
+    st.markdown("<h4 style='text-align: center; color: #306998;'>Translate English to Wolof</h4>", unsafe_allow_html=True)
+    col1, col2 = st.columns(2)
+    with col1:
+        input_mode = st.selectbox("Select input mode:", ("Text", "Audio", "File"))
+        st.markdown("<hr>", unsafe_allow_html=True)
+    with col2:
+        target_lang = "Wolof (Senegal)"
+        st.markdown(f"<p><b>Target language:</b> {target_lang}</p>", unsafe_allow_html=True)
+        st.markdown("<hr>", unsafe_allow_html=True)
+    col3, col4 = st.columns(2)
+    with col3:
+        input_text = ""
+        if input_mode == "Text":
+            input_text = st.text_area("✏️ Enter English text:", height=250)
+        elif input_mode == "Audio":
+            audio_file = st.file_uploader("🔊 Upload audio (.wav, .mp3, .m4a)", type=["wav", "mp3", "m4a"])
+            if audio_file:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=f".{audio_file.type.split('/')[-1]}") as tmp:
+                    tmp.write(audio_file.read())
+                    tmp_path = tmp.name
+                with st.spinner("Transcribing..."):
+                    input_text = transcribe_audio(tmp_path)
+                    os.remove(tmp_path)
+                    st.text_area("📝 Transcribed Text:", value=input_text, height=150)
+        elif input_mode == "File":
+            uploaded_file = st.file_uploader("📄 Upload document (PDF, Word, HTML, Markdown, SRT, TXT)",
+                                             type=["pdf", "docx", "html", "htm", "md", "srt", "txt"])
+            if uploaded_file:
+                try:
+                    input_text = extract_text_from_file(uploaded_file)
+                    st.text_area("📃 Extracted Text:", value=input_text, height=200)
+                except Exception as e:
+                    st.error(f"Error extracting text: {str(e)}")
+    with col4:
+        if input_text:
+            with st.spinner("Translating..."):
+                translated_text = translate(input_text)
+            st.write(f"Output in {target_lang}")
+            st.success(translated_text)
+            st.download_button(
+                label="💾 Download Translation",
+                data=translated_text,
+                file_name=f"translated_{target_lang.replace(' ', '_').lower()}.txt",
+                mime="text/plain"
+            )
+        else:
+            st.info("Translation will appear here.")
+    st.markdown("""
+    <hr>
+    <div style='text-align: center; color: #4B8BBE; font-size: 0.9rem'>
+        LocaleNLP © 2025 • Empowering communities through AI and language
+    </div>""", unsafe_allow_html=True)
+# Build the page only when this file is executed as the main script.
+if __name__ == "__main__":
+    main()

localenpl5.jpeg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+altair
+pandas
+streamlit
+transformers
+torch
+openai-whisper
+nltk
+PyMuPDF
+python-docx
+beautifulsoup4
+markdown2
+chardet