Mgolo committed on
Commit
79d1580
·
verified ·
1 Parent(s): fec6a5a

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +169 -0
  2. localenpl5.jpeg +0 -0
  3. requirements.txt +12 -0
app.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import os
3
+ # Redirect Streamlit config/cache to a writable temp folder
4
+ os.environ["STREAMLIT_HOME"] = "/tmp/.streamlit"
5
+
6
+ import streamlit as st
7
+ from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
8
+ import torch
9
+ import unicodedata
10
+ import re
11
+ import whisper
12
+ import tempfile
13
+ import os
14
+
15
+ import nltk
16
+ nltk.download('punkt')
17
+ from nltk.tokenize import sent_tokenize
18
+
19
+ import fitz # PyMuPDF for PDF
20
+ import docx
21
+ from bs4 import BeautifulSoup
22
+ import markdown2
23
+ import chardet
24
+
25
+ # --- Device selection ---
26
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
27
+
28
+ # --- Load Wolof MarianMT model from HF hub ---
29
@st.cache_resource
def load_wolof_model():
    """Build the English-to-Wolof translation pipeline.

    Loads the ``LocaleNLP/eng_wolof`` checkpoint and its Marian tokenizer
    from the Hugging Face hub, moves the model onto the module-level
    ``device`` and wraps both in a ``translation`` pipeline. The function is
    decorated with ``st.cache_resource`` so the result is reused.

    Returns:
        The ``transformers`` translation pipeline.
    """
    checkpoint = "LocaleNLP/eng_wolof"
    tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
    # The pipeline is given a CUDA ordinal (0) on GPU, or -1 for CPU.
    pipeline_device = -1 if device.type != 'cuda' else 0
    return pipeline(
        "translation",
        model=seq2seq,
        tokenizer=tokenizer,
        device=pipeline_device,
    )
36
+
37
@st.cache_resource
def load_whisper_model():
    """Load the Whisper ``"base"`` model.

    Decorated with ``st.cache_resource`` so the loaded model is reused.

    Returns:
        The object returned by ``whisper.load_model("base")``.
    """
    base_model = whisper.load_model("base")
    return base_model
40
+
41
def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* with the cached Whisper model.

    Args:
        audio_path: Filesystem path of the audio file to transcribe.

    Returns:
        The ``"text"`` entry of Whisper's transcription result.
    """
    transcription = load_whisper_model().transcribe(audio_path)
    return transcription["text"]
44
+
45
def _capitalize_first(sentence):
    """Upper-case only the first character of *sentence*.

    ``str.capitalize()`` is deliberately avoided: it also lower-cases the rest
    of the string, which would mangle proper nouns and acronyms.
    """
    return sentence[:1].upper() + sentence[1:]


def translate(text):
    """Translate English *text* to Wolof, preserving its line structure.

    The text is split on newlines; each non-blank line is split into
    sentences on ``". "``, every sentence is prefixed with the ``>>wol<<``
    language tag and sent to the translation pipeline as one batch. The
    translated sentences are re-joined with ``". "`` and the lines with
    ``"\\n"``. Blank lines are kept as empty lines.

    Args:
        text: The English source text.

    Returns:
        The translated text, with one output line per input line.
    """
    paragraphs = text.split("\n")

    # Nothing translatable: answer without loading the model at all.
    if not any(para.strip() for para in paragraphs):
        return "\n".join("" for _ in paragraphs)

    translator = load_wolof_model()
    lang_tag = ">>wol<<"
    translated_output = []

    with torch.no_grad():
        for para in paragraphs:
            if not para.strip():
                translated_output.append("")
                continue

            sentences = [s.strip() for s in para.split('. ') if s.strip()]
            formatted = [f"{lang_tag} {s}" for s in sentences]

            # NOTE(review): max_length=5000 may exceed the checkpoint's
            # positional limit -- confirm against the model config.
            results = translator(
                formatted,
                max_length=5000,
                num_beams=5,
                early_stopping=True,
                no_repeat_ngram_size=3,
                repetition_penalty=1.5,
                length_penalty=1.2,
            )
            translated_sentences = [
                _capitalize_first(r['translation_text']) for r in results
            ]
            translated_output.append('. '.join(translated_sentences))

    return "\n".join(translated_output)
71
+
72
# Matches one SRT cue header: the numeric index line followed by the
# "HH:MM:SS,mmm --> ..." timing line. Accepts LF and CRLF line endings.
_SRT_CUE_HEADER = re.compile(r"\d+\r?\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\r?\n")

_TEXT_FILE_TYPES = ("html", "htm", "md", "srt", "txt", "text")


def _decode_bytes(raw):
    """Decode *raw* bytes to ``str``: UTF-8 first, then chardet's guess."""
    try:
        # utf-8-sig also drops a leading byte-order mark when present.
        return raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        pass

    # chardet reports None when it cannot decide; fall back to UTF-8 so the
    # caller always receives str, never bytes.
    guessed = chardet.detect(raw)['encoding'] or "utf-8"
    try:
        return raw.decode(guessed, errors='ignore')
    except LookupError:
        # chardet named a codec this interpreter does not provide.
        return raw.decode("utf-8", errors='ignore')


def extract_text_from_file(uploaded_file):
    """Extract plain text from an uploaded document.

    The format is chosen from the file name's extension (case-insensitive):
    ``pdf``, ``docx``, ``html``/``htm``, ``md``, ``srt`` and ``txt``/``text``.

    Args:
        uploaded_file: A file-like object exposing ``name`` and ``read()``.

    Returns:
        The extracted text as ``str``.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    import io  # local import: only needed to re-wrap the bytes for python-docx

    file_type = uploaded_file.name.split('.')[-1].lower()
    content = uploaded_file.read()

    if file_type == "pdf":
        with fitz.open(stream=content, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)

    if file_type == "docx":
        # Hand python-docx a fresh stream over the bytes already read instead
        # of the consumed upload object.
        document = docx.Document(io.BytesIO(content))
        return "\n".join(para.text for para in document.paragraphs)

    if file_type not in _TEXT_FILE_TYPES:
        raise ValueError("Unsupported file type")

    text = _decode_bytes(content)

    if file_type in ("html", "htm"):
        return BeautifulSoup(text, "html.parser").get_text()
    if file_type == "md":
        # Render Markdown to HTML, then strip the tags.
        rendered = markdown2.markdown(text)
        return BeautifulSoup(rendered, "html.parser").get_text()
    if file_type == "srt":
        # Drop cue numbers and timing lines, keep the subtitle text.
        return _SRT_CUE_HEADER.sub("", text)
    return text
99
+
100
def main():
    """Render the Streamlit page and run the translate workflow.

    Layout: a sidebar with the logo and a short description, a row with the
    input-mode selector and the fixed target language, and a second row with
    the input widget (text box, audio upload or document upload) on the left
    and the translation plus a download button on the right.
    """
    st.set_page_config(page_title="LocaleNLP English-to-Wolof Translator", layout="wide", initial_sidebar_state="expanded")

    with st.sidebar:
        st.image("localenpl5.jpeg", use_container_width=True)
        st.markdown("""
        <h3 style='text-align: left; color: #4B8BBE;'>🌐 Wolof Translation</h3>
        This app translates English text to Wolof (Senegal) using a custom MarianMT model hosted on Hugging Face.
        """, unsafe_allow_html=True)

    st.markdown("<h4 style='text-align: center; color: #306998;'>Translate English to Wolof</h4>", unsafe_allow_html=True)

    col1, col2 = st.columns(2)
    with col1:
        input_mode = st.selectbox("Select input mode:", ("Text", "Audio", "File"))
        st.markdown("<hr>", unsafe_allow_html=True)
    with col2:
        target_lang = "Wolof (Senegal)"
        st.markdown(f"<p><b>Target language:</b> {target_lang}</p>", unsafe_allow_html=True)
        st.markdown("<hr>", unsafe_allow_html=True)

    col3, col4 = st.columns(2)
    with col3:
        input_text = ""
        if input_mode == "Text":
            input_text = st.text_area("✏️ Enter English text:", height=250)
        elif input_mode == "Audio":
            audio_file = st.file_uploader("🔊 Upload audio (.wav, .mp3, .m4a)", type=["wav", "mp3", "m4a"])
            if audio_file:
                # Take the suffix from the uploaded file's own name: the MIME
                # subtype (e.g. "mpeg", "x-m4a") is not a file extension.
                suffix = os.path.splitext(audio_file.name)[1]
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                    tmp.write(audio_file.read())
                    tmp_path = tmp.name
                try:
                    with st.spinner("Transcribing..."):
                        input_text = transcribe_audio(tmp_path)
                except Exception as e:
                    # Report the failure in the UI, like the File branch does.
                    st.error(f"Error transcribing audio: {str(e)}")
                else:
                    st.text_area("📝 Transcribed Text:", value=input_text, height=150)
                finally:
                    # Always remove the temp file, even when transcription fails.
                    os.remove(tmp_path)
        elif input_mode == "File":
            uploaded_file = st.file_uploader("📄 Upload document (PDF, Word, HTML, Markdown, SRT, TXT)",
                                             type=["pdf", "docx", "html", "htm", "md", "srt", "txt"])
            if uploaded_file:
                try:
                    input_text = extract_text_from_file(uploaded_file)
                    st.text_area("📃 Extracted Text:", value=input_text, height=200)
                except Exception as e:
                    st.error(f"Error extracting text: {str(e)}")

    with col4:
        if input_text:
            with st.spinner("Translating..."):
                translated_text = translate(input_text)
            st.write(f"Output in {target_lang}")
            st.success(translated_text)

            st.download_button(
                label="💾 Download Translation",
                data=translated_text,
                file_name=f"translated_{target_lang.replace(' ', '_').lower()}.txt",
                mime="text/plain"
            )
        else:
            st.info("Translation will appear here.")

    st.markdown("""
    <hr>
    <div style='text-align: center; color: #4B8BBE; font-size: 0.9rem'>
    LocaleNLP © 2025 • Empowering communities through AI and language
    </div>""", unsafe_allow_html=True)
167
+
168
+ if __name__ == "__main__":
169
+ main()
localenpl5.jpeg ADDED
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
altair
pandas
streamlit
transformers
# MarianTokenizer (used by app.py) needs the SentencePiece backend.
sentencepiece
torch
# `import whisper` / `whisper.load_model` in app.py come from OpenAI's package;
# the PyPI distribution named plain "whisper" is an unrelated project.
openai-whisper
nltk
PyMuPDF
python-docx
beautifulsoup4
markdown2
chardet