tnt306 committed
Commit b77991a · 1 Parent(s): 545de03

Initial Version (Qwen2.5-7B-Instruct-1M-q4_k_m Original & Finetuned; Chat Web UI)

Files changed (10)
  1. .gitattributes +3 -0
  2. Dockerfile +44 -0
  3. build_docker.sh +2 -0
  4. prompt_parsing.py +211 -0
  5. run_docker.sh +1 -0
  6. run_local_llm_server.py +19 -0
  7. server_ui.py +170 -0
  8. test.py +11 -0
  9. total_run.sh +3 -0
  10. utils.py +89 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.gguf filter=lfs diff=lfs merge=lfs -text
+ *.so filter=lfs diff=lfs merge=lfs -text
+ binaries/bin/llama-server filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,44 @@
+ FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04
+
+ RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
+ RUN wget https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh -O ~/Anaconda3-2024.10-1-Linux-x86_64.sh
+ RUN bash ~/Anaconda3-2024.10-1-Linux-x86_64.sh -b -p /root/anaconda3
+ ENV PATH="/root/anaconda3/bin:$PATH"
+ RUN rm ~/Anaconda3-2024.10-1-Linux-x86_64.sh
+
+ RUN apt-get update && apt-get install -y libcurl4-openssl-dev libgomp1
+
+ WORKDIR /LLM-App
+
+ COPY binaries ./binaries
+ COPY Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned.gguf .
+ COPY Qwen2.5-7B-Instruct-1M-q4_k_m-Original.gguf .
+ COPY run_local_llm_server.py .
+
+ RUN chmod +x ./binaries/bin/llama-server
+
+ WORKDIR /root/anaconda3
+ RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
+
+ WORKDIR /LLM-App
+
+ RUN pip3 install langchain-core
+ RUN pip3 install pydantic-ai
+ RUN pip3 install langgraph
+ RUN pip3 install langchain-google-genai
+ RUN pip3 install langchain-openai
+
+ RUN apt-get -y install graphviz graphviz-dev
+ RUN apt-get -y install gcc
+ RUN pip3 install pygraphviz
+
+ COPY utils.py .
+ COPY test.py .
+
+ COPY prompt_parsing.py .
+ COPY server_ui.py .
+
+ COPY total_run.sh .
+ RUN chmod +x total_run.sh
+
+ CMD ["/usr/bin/bash", "./total_run.sh"]
build_docker.sh ADDED
@@ -0,0 +1,2 @@
+ docker build -t finetuned-llm-demo-app-huggingface-docker .
+ docker rmi -f $(sudo docker images -f "dangling=true" -q)
prompt_parsing.py ADDED
@@ -0,0 +1,211 @@
+ from langchain_core.prompts import ChatPromptTemplate
+ from typing import Dict, List, Literal
+ from langchain_openai import ChatOpenAI
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from os import getenv
+ from time import sleep
+ from enum import Enum
+ import functools
+ from langgraph.graph import StateGraph, START, END
+ from langgraph.checkpoint.memory import MemorySaver
+ from typing import TypedDict, Annotated, List, Any
+ from pydantic_ai import Agent
+ from pydantic_ai.models.openai import OpenAIModel
+ from pydantic_ai.models.gemini import GeminiModel
+ from pydantic_ai.messages import ModelMessage, ModelMessagesTypeAdapter
+ from langgraph.types import StreamWriter
+ from pydantic_ai.providers.openai import OpenAIProvider
+ from pydantic_ai.providers.google_gla import GoogleGLAProvider
+ from langchain_core.runnables.config import RunnableConfig
+
+
+ class ModelNames(Enum):
+     Qwen25_7B_Instruct_1M_q4_k_m_Finetuned = "Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned"
+     Qwen25_7B_Instruct_1M_q4_k_m_Original = "Qwen2.5-7B-Instruct-1M-q4_k_m-Original"
+
+
+ class LLMWaitTime(Enum):
+     """
+     OpenRouter allows 20 requests per minute and 200 requests per day on the free tier, i.e. 3 seconds per request. (https://openrouter.ai/docs/api-reference/limits)
+     Gemini 2.0 Flash: RPM: 15; RPD: 1,500 ➔ i.e. 4 seconds per request. (https://ai.google.dev/gemini-api/docs/rate-limits#free-tier)
+     """
+
+     OpenRouter_DeepSeek_R1 = 3
+     OpenRouter_Qwen25_72B_Instruct = 3
+     OpenRouter_Llama33_70B_Instruct = 3
+     Google_Gemini_20_Flash = 4
+
+
+ LOCAL_LLM_URL = "http://127.0.0.1"
+ # LOCAL_LLM_URL = "http://192.168.178.45"
+ LOCAL_LLM_API = "36b81180-08d2-4b73-826c-3203e0698c06"
+
+ prompt_arxiv_qa = ChatPromptTemplate(
+     [
+         ("system", "You are a helpful Research bot."),
+         (
+             "human",
+             'Below is the title and abstract of a paper from arXiv. Create {num_questions} pairs of questions and corresponding answers, based on the title and abstract. Avoid using abbreviations and acronyms. Questions start with the string "Question:". Answers start with the string "Answer:". Include only the list and nothing else.\n\nTitle: {title}\n\nAbstract: {abstract}',
+         ),
+     ]
+ )
+
+ prompt_arxiv_summary = ChatPromptTemplate(
+     [
+         ("system", "You are a helpful Research bot."),
+         (
+             "human",
+             "Below is the title and abstract of a paper from arXiv. Summarize it, and additionally include other relevant information to help users understand the paper better.\n\nTitle: {title}\n\nAbstract: {abstract}",
+         ),
+     ]
+ )
+
+ prompt_paraphrase = ChatPromptTemplate(
+     [
+         ("system", "You are a helpful Research bot. {further_instruction}"),
+         ("human", "Paraphrase the following {thing} below:\n\n{thing}:{sentence}"),
+     ]
+ )
+
+ # oneshot_deepseek_llm = ChatOpenAI(
+ #     openai_api_key=getenv("OPENROUTER_API_KEY"),
+ #     openai_api_base="https://openrouter.ai/api/v1",
+ #     model_name="deepseek/deepseek-r1:free",
+ # )
+ # oneshot_gemini_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
+
+ # oneshot_qwen_llm = ChatOpenAI(
+ #     openai_api_key=getenv("OPENROUTER_API_KEY"),
+ #     openai_api_base="https://openrouter.ai/api/v1",
+ #     model_name="qwen/qwen-2.5-72b-instruct:free",
+ # )
+
+ # oneshot_llama_llm = ChatOpenAI(
+ #     openai_api_key=getenv("OPENROUTER_API_KEY2"),
+ #     openai_api_base="https://openrouter.ai/api/v1",
+ #     model_name="meta-llama/llama-3.3-70b-instruct:free",
+ # )
+
+
+ def parse_arxiv_qa_prompt_output(output: str) -> List[Dict]:
+     lines = output.split("\n")
+     lst_qa = []
+     question = ""
+     answer = ""
+
+     for line in lines:
+         line = line.strip()
+         if len(line) > 0:
+             if line.startswith("Question:"):
+                 question = line[line.index(" ") + 1 :]
+             elif line.startswith("Answer:"):
+                 answer = line[line.index(" ") + 1 :]
+                 lst_qa.append({"question": question, "answer": answer})
+                 question = ""
+                 answer = ""
+             else:
+                 print(f"Error: [{line}] is neither a question nor an answer")
+
+     return lst_qa
+
+
+ def llm_wait_after_request(provider: LLMWaitTime):
+     def decorator(some_function):
+         @functools.wraps(some_function)
+         def wrapper(*args, **kwargs):
+             res = some_function(*args, **kwargs)
+             sleep(provider.value)
+             return res
+
+         return wrapper
+
+     return decorator
+
+
+ ########################################################
+ # Define state schema
+ class AgentState(TypedDict):
+     latest_user_message: str
+     messages: Annotated[List[bytes], lambda x, y: x + y]
+
+
+ reasoner_system_prompt = 'You are a helpful Artificial Intelligence (AI) Research bot, with expertise on Large Language Model (LLM). You have especially deep knowledge about the Research Paper "Byte Latent Transformer (BLT): Patches Scale Better Than Tokens". Users can ask you questions, and you will provide the corresponding answers. If the questions are related to Byte Latent Transformer (BLT), the answers must be in a detailed manner, and primarily come from the information in the Research Paper, additionally with your general knowledge. The goal is to help users understand fully.'
+
+ reasoner_agents = {
+     ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Finetuned.value: Agent(
+         OpenAIModel(
+             ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Finetuned.value,
+             provider=OpenAIProvider(
+                 api_key=LOCAL_LLM_API,
+                 base_url=f"{LOCAL_LLM_URL}:8081/v1",
+             ),
+         ),
+         system_prompt=reasoner_system_prompt,
+         retries=3,
+     ),
+     ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Original.value: Agent(
+         OpenAIModel(
+             ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Original.value,
+             provider=OpenAIProvider(
+                 api_key=LOCAL_LLM_API,
+                 base_url=f"{LOCAL_LLM_URL}:8080/v1",
+             ),
+         ),
+         system_prompt=reasoner_system_prompt,
+         retries=3,
+     ),
+ }
+
+ # router_agent = Agent(
+ #     GeminiModel(
+ #         "gemini-2.0-flash",
+ #         provider=GoogleGLAProvider(api_key=os.getenv("GOOGLE_API_KEY")),
+ #     ),
+ #     system_prompt="Your job is to route the user message either to the end of the conversation or to continue the conversation.",
+ #     retries=3,
+ # )
+
+
+ async def reasoner(state: AgentState, writer: StreamWriter, config: RunnableConfig):
+     print(f"reasoner(): latest_user_message = {state['latest_user_message']}")
+
+     model = config["configurable"]["model"]  # type: ignore
+     reasoner_agent = reasoner_agents[model]
+
+     print(f"reasoner(): chosen model = {model}")
+
+     # Get the message history into the format for Pydantic AI
+     message_history: list[ModelMessage] = []
+     for message_row in state["messages"]:
+         message_history.extend(ModelMessagesTypeAdapter.validate_json(message_row))
+
+     async with reasoner_agent.run_stream(
+         state["latest_user_message"], message_history=message_history
+     ) as result:
+         async for chunk in result.stream_text(delta=True):
+             writer(chunk)
+     """MyNote:
+     The "new_messages_json" includes the latest user message and the AI's response.
+     If first time, it will include the system prompt as well.
+     """
+     return {"messages": [result.new_messages_json()]}
+
+
+ def generate_agentic_flow():
+     builder = StateGraph(AgentState)
+
+     # Add nodes
+     builder.add_node("reasoner", reasoner)
+
+     # Set edges
+     builder.add_edge(START, "reasoner")
+     builder.add_edge("reasoner", END)
+
+     # Maintain memory across different graph runs. ➔ Must also use "thread_id" in RunnableConfig/"configurable".
+     memory = MemorySaver()
+     agentic_flow = builder.compile(checkpointer=memory)
+
+     # For debug
+     agentic_flow.get_graph().draw_png("graph.png")
+
+     return agentic_flow
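
For reference, a minimal sketch (not part of the committed file) of how the compiled flow from generate_agentic_flow() could be driven outside the Streamlit UI, assuming both local llama-server instances are already running and pygraphviz is installed for draw_png; the demo() wrapper and the example question are illustrative only.

import asyncio
import uuid

from langchain_core.runnables import RunnableConfig
from prompt_parsing import generate_agentic_flow, ModelNames


async def demo():
    flow = generate_agentic_flow()
    # Per-conversation memory: "thread_id" keys the MemorySaver checkpoints,
    # and "model" picks which local llama-server endpoint the reasoner uses.
    config = RunnableConfig(
        configurable={
            "thread_id": str(uuid.uuid4()),
            "model": ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Finetuned.value,
        }
    )
    async for chunk in flow.astream(
        {"latest_user_message": "What are the benefits of Byte Latent Transformer?"},
        config,
        stream_mode="custom",
    ):
        # With stream_mode="custom", each chunk is a text delta emitted by the StreamWriter.
        print(chunk, end="", flush=True)


asyncio.run(demo())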
run_docker.sh ADDED
@@ -0,0 +1 @@
+ docker run --rm -p 8501:8501 --gpus all finetuned-llm-demo-app-huggingface-docker
run_local_llm_server.py ADDED
@@ -0,0 +1,19 @@
+ from utils import restart_exec
+ import os
+
+ if __name__ == "__main__":
+     print("*** START ***")
+
+     current_script_folder = os.path.dirname(os.path.realpath(__file__))
+
+     models = [
+         "Qwen2.5-7B-Instruct-1M-q4_k_m-Original",
+         "Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned",
+     ]
+
+     for idx, model_name in enumerate(models):
+         llama_server_path = f'{current_script_folder}/binaries/bin/llama-server --temp 0.5 --top-k 40 --top-p 0.95 --min-p 0.05 --repeat-last-n 128 --repeat-penalty 1.1 --presence-penalty 0.0 --frequency-penalty 0.0 --dry-multiplier 0.0 --xtc-probability 0.0 --typical 1.0 --n-predict -1 --ctx-size 4096 --n-gpu-layers 100 --seed -1 --api-key 36b81180-08d2-4b73-826c-3203e0698c06 --no-webui --host 0.0.0.0 --port 808{idx} --log-timestamps --log-colors --alias "{model_name}" -m "{current_script_folder}/{model_name}.gguf"'
+
+         restart_exec(llama_server_path, "server is listening on")
+
+     print("*** END ***")
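
Each launched instance exposes llama-server's OpenAI-compatible /v1 endpoint on its port, so it can also be queried directly. A minimal sketch, assuming the openai Python client is available (langchain-openai pulls it in); the question text is only an example.

from openai import OpenAI

# Port 8080 serves the Original model and 8081 the Finetuned one (see the loop above).
client = OpenAI(
    base_url="http://127.0.0.1:8081/v1",
    api_key="36b81180-08d2-4b73-826c-3203e0698c06",  # the --api-key passed to llama-server
)

response = client.chat.completions.create(
    model="Qwen2.5-7B-Instruct-1M-q4_k_m-Finetuned",  # must match the --alias above
    messages=[{"role": "user", "content": "What is the Byte Latent Transformer?"}],
)
print(response.choices[0].message.content)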
server_ui.py ADDED
@@ -0,0 +1,170 @@
+ import streamlit as st
+ import asyncio
+ import uuid
+ from utils import get_current_time
+ from prompt_parsing import generate_agentic_flow, ModelNames
+ from langchain_core.runnables import RunnableConfig
+ import pandas as pd
+ import json
+ import ast
+
+
+ # Cache that returns global resources (e.g. database connections, ML models).
+ # Cached objects are shared across all users, sessions, and reruns.
+ @st.cache_resource
+ def get_global_server_session_id():
+     return str(uuid.uuid4())
+
+
+ # Prepare data unique to this current session.
+ def initialize_session_if_not(force=False):
+     if force or "local_session_id" not in st.session_state:
+         st.session_state.local_session_id = str(uuid.uuid4())
+     if force or "local_agentic_flow" not in st.session_state:
+         st.session_state.local_agentic_flow = generate_agentic_flow()
+     if force or "local_config" not in st.session_state:
+         current_chosen_model = (
+             st.session_state.get("local_config", {})
+             .get("configurable", {})
+             .get("model")
+         )
+
+         to_assign_model = (
+             ModelNames.Qwen25_7B_Instruct_1M_q4_k_m_Finetuned.value
+             if current_chosen_model is None
+             else current_chosen_model
+         )
+
+         # The memory thread_id for this local session.
+         st.session_state.local_config = RunnableConfig(
+             configurable={
+                 "thread_id": st.session_state.local_session_id,
+                 "model": to_assign_model,
+             }
+         )
+     if force or "messages" not in st.session_state:
+         st.session_state.messages = []
+
+
+ async def run_agent_with_streaming(user_input: str):
+     # The "config" parameter allows running the Graph with different configurations (system message or LLM to use).
+     async for msg in st.session_state.local_agentic_flow.astream(
+         {"latest_user_message": user_input},
+         st.session_state.local_config,
+         stream_mode="custom",
+     ):
+         yield msg
+
+
+ def reset_conversation():
+     initialize_session_if_not(True)
+
+
+ def choose_model():
+     st.session_state.local_config["configurable"][
+         "model"
+     ] = st.session_state.chosen_model
+
+
+ @st.dialog("st.session_state:")
+ def display_session_state():
+     ss_str = str(st.session_state)
+     st.write(ss_str)
+
+
+ async def main():
+     st.set_page_config(page_title="Finetuned LLM Demo", layout="wide")
+     st.title("Finetuned LLM Demo")
+     st.markdown(
+         """
+         <style>
+         header {background-color: transparent !important;}
+         .block-container {
+             padding-top: 0px;
+         }
+         .reportview-container {
+             margin-top: -2em;
+         }
+         #MainMenu {visibility: hidden;}
+         .stDeployButton {display:none;}
+         footer {visibility: hidden;}
+         #stDecoration {display:none;}
+         </style>
+         """,
+         unsafe_allow_html=True,
+     )
+
+     with st.expander("System", expanded=False):
+         df = pd.DataFrame.from_records(
+             [
+                 {
+                     "Global Server Session ID": get_global_server_session_id(),
+                     "Local Session ID": st.session_state.local_session_id,
+                     "Page Refreshed at": get_current_time(),
+                 }
+             ]
+         )
+         st.table(df)
+
+     col1, col2 = st.columns(2)
+
+     with col1:
+         st.selectbox(
+             label="Model:",
+             options=tuple([e.value for e in ModelNames]),
+             index=0,
+             key="chosen_model",
+             on_change=choose_model,
+             label_visibility="collapsed",
+         )
+
+     with col2:
+         st.button(label="st.session_state", on_click=display_session_state)
+
+     st.write(
+         'Ask me about LLM in particular and AI in general. BTW, I\'m an expert on "Byte Latent Transformer". Example: What are the benefits of Byte Latent Transformer?'
+     )
+
+     st.button("New Chat", on_click=reset_conversation)
+
+     # Display chat messages from history on app rerun
+     for message in st.session_state.messages:
+         message_type = message["type"]
+         if message_type in ["human", "ai", "system"]:
+             with st.chat_message(message_type):
+                 st.markdown(message["content"])
+
+     # Chat input for the user
+     user_input = st.chat_input("What question is in your mind right now?")
+
+     if user_input:
+         # We append a new request to the conversation explicitly
+         st.session_state.messages.append({"type": "human", "content": user_input})
+
+         # Display user prompt in the UI
+         with st.chat_message("user"):
+             st.markdown(user_input)
+
+         # Display assistant response in chat message container
+         response_content = ""
+         with st.chat_message("assistant"):
+             message_placeholder = st.empty()  # Placeholder for updating the message
+             # Run the async generator to fetch responses
+             async for chunk in run_agent_with_streaming(user_input):
+                 response_content += chunk
+                 # Update the placeholder with the current response content
+                 message_placeholder.markdown(response_content)
+
+         st.session_state.messages.append({"type": "ai", "content": response_content})
+
+
+ if __name__ == "__main__":
+     print("*** START ***")
+     try:
+         initialize_session_if_not(False)
+         asyncio.run(main())
+     except Exception as ex:
+         print("*** ERROR ***")
+         print(ex)
+
+     print("*** END ***")
test.py ADDED
@@ -0,0 +1,11 @@
+ import torch
+ import sys
+
+ print(f"sys.version: {sys.version}")
+ print(f"torch.__version__: {torch.__version__}")
+ print(f"torch.cuda.is_available(): {torch.cuda.is_available()}")
+ print(f"torch.cuda.device_count(): {torch.cuda.device_count()}")
+ print(f"torch.cuda.current_device(): {torch.cuda.current_device()}")
+ print(f"torch.cuda.device(0): {torch.cuda.device(0)}")
+ print(f"torch.cuda.get_device_name(0): {torch.cuda.get_device_name(0)}")
+ print(f"torch.version.cuda: {torch.version.cuda}")
total_run.sh ADDED
@@ -0,0 +1,3 @@
+ #!/bin/bash
+ python run_local_llm_server.py
+ streamlit run server_ui.py --server.port 7860
utils.py ADDED
@@ -0,0 +1,89 @@
+ from datetime import datetime, timezone
+ import socket
+ import shlex
+ import subprocess
+ import os
+ import psutil
+
+
+ def get_current_time():
+     return datetime.now(timezone.utc).strftime("%Y-%m-%d_%H-%M-%S")
+
+
+ def check_server(address, port):
+     # Create a TCP socket
+     s = socket.socket()
+     print(f"Attempting to connect to {address} on port {port}")
+     try:
+         s.connect((address, port))
+         print(f"Connected to {address} on port {port}")
+         return True
+     except socket.error as e:
+         print(f"Connection to {address} on port {port} failed: {e}")
+         return False
+     finally:
+         s.close()
+
+
+ ###############################################################################
+ invalid_chars = frozenset('<>:"/\\|?*')
+
+
+ def get_valid_filename(filename: str):
+     return "".join("-" if c in invalid_chars else c for c in filename)
+
+
+ ###############################################################################
+ # Example usage: restart_exec(llama_server_path, "server is listening on")
+ def restart_exec(shell_exec_path_with_args, success_text):
+     exec_path = shlex.split(shell_exec_path_with_args)[
+         0
+     ]  # supposed to have an absolute path!
+     exec_name = os.path.basename(exec_path)
+     exec_folder = os.path.dirname(exec_path)
+
+     # Kill all existing exec processes.
+     for proc in psutil.process_iter():
+         # check whether the process name matches
+         if proc.name() == exec_name:
+             proc.kill()
+
+     # Start a new llama-server process.
+     exec_proc = subprocess.Popen(
+         shell_exec_path_with_args,
+         stdout=subprocess.PIPE,
+         stderr=subprocess.PIPE,
+         text=True,
+         shell=True,
+         cwd=f"{exec_folder}",
+     )
+
+     os.set_blocking(exec_proc.stdout.fileno(), False)
+
+     started = False
+
+     while exec_proc.poll() is None:
+         output = exec_proc.stdout.readline()
+         error = exec_proc.stderr.readline()
+         current_line = (
+             curr_line_output
+             if output and len(curr_line_output := output.strip()) > 0
+             else ""
+         ) + (
+             curr_line_error
+             if error and len(curr_line_error := error.strip()) > 0
+             else ""
+         )
+         if len(current_line) > 0:
+             # print(f"Current Line: {current_line}")
+             if success_text in current_line:
+                 print(
+                     f"➔ Found the success text [{success_text}] at [{get_current_time()}]."
+                 )
+                 started = True
+                 break
+
+     if not started:
+         print(f"Failed to start the {exec_name}. returncode: {exec_proc.returncode}")
+
+     return started