hblim committed on
Commit
e3f9537
·
1 Parent(s): deff88d

Added spike visualization to app

Browse files
frontend/app.py CHANGED
@@ -2,6 +2,8 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import altair as alt
 
 
5
 
6
  # Call page config BEFORE importing modules that use Streamlit commands
7
  st.set_page_config(page_title="Reddit Sentiment Trends", layout="wide")
@@ -22,11 +24,8 @@ st.markdown(
22
  """
23
  **Welcome!** This page shows how Reddit's AI communities feel day-to-day.
24
 
25
- • A daily pipeline grabs new posts and comments, scores their tone with a sentiment model, and saves the totals to a public HuggingFace [dataset](https://huggingface.co/datasets/hblim/top_reddit_posts_daily). \n
26
- • The line chart below plots *community-weighted sentiment*: each post/comment's sentiment is scaled by its upvotes so busier discussions matter more. Values run from −1 (negative) to +1 (positive). \n
27
- • The table further down lets you drill into the posts that shaped the mood on a chosen date. \n\n
28
 
29
- Pick a subreddit and explore!
30
  """
31
  )
32
 
@@ -39,6 +38,14 @@ last_update_caption = get_last_updated_hf_caption()
39
  subreddits = df["subreddit"].unique()
40
  subreddit_colors = get_subreddit_colors(subreddits)
41
 
 
 
 
 
 
 
 
 
42
  # Define time format to use across all charts
43
  time_format = "%m/%d/%Y"
44
 
@@ -47,8 +54,16 @@ min_date = df["date"].min().date()
47
  max_date = df["date"].max().date()
48
 
49
  # ── Community weighted sentiment line chart for all subreddits ───────────────
50
- st.subheader("Community Weighted Sentiment by Subreddit")
51
-
 
 
 
 
 
 
 
 
52
  # Add date range selector for the time series
53
  date_range = st.date_input(
54
  "Select date range for time series",
@@ -68,6 +83,33 @@ selected_subreddit = st.selectbox(
68
  )
69
  plot_df = filtered_df[filtered_df["subreddit"] == selected_subreddit]
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  # Define hover selection for nearest point
72
  nearest = alt.selection_single(
73
  name="nearest",
@@ -77,17 +119,30 @@ nearest = alt.selection_single(
77
  empty="none"
78
  )
79
 
80
- # Base chart for DRY encoding (single subreddit, constant colour)
81
  base = alt.Chart(plot_df).encode(
82
- x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format)),
83
- y=alt.Y("community_weighted_sentiment:Q", title="Community Weighted Sentiment")
 
 
 
 
84
  )
85
-
86
- # Determine colour for the chosen subreddit
87
- line_colour = subreddit_colors.get(selected_subreddit, "#1f77b4")
88
 
89
  # Draw line for the selected subreddit
90
- line = base.mark_line(color=line_colour)
 
 
 
 
 
 
 
 
 
 
91
 
92
  # Invisible selectors to capture hover events
93
  selectors = base.mark_point(opacity=0).add_selection(nearest)
@@ -106,16 +161,113 @@ tooltips = base.mark_rule(color="gray").encode(
106
  ]
107
  ).transform_filter(nearest)
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  # Layer everything and make interactive, with title showing subreddit
110
- hover_chart = alt.layer(line, selectors, points_hover, tooltips).properties(
111
- height=300,
112
- title=alt.TitleParams(
113
- text=f"Daily Community Weighted Sentiment for {selected_subreddit}",
114
- offset=20 # adds space above the title so it is not cut off
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  )
116
- ).interactive()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- st.altair_chart(hover_chart, use_container_width=True)
119
 
120
  # ── Bar chart for post counts by subreddit (side-by-side) ────────────────────
121
  st.subheader("Daily Post Counts by Subreddit")
@@ -131,7 +283,7 @@ bar_chart = alt.Chart(df).mark_bar().encode(
131
  legend=alt.Legend(title="Subreddit")
132
  ),
133
  tooltip=["date", "subreddit", "count"]
134
- ).properties(height=300).interactive()
135
 
136
  st.altair_chart(bar_chart, use_container_width=True)
137
 
 
2
  import pandas as pd
3
  import numpy as np
4
  import altair as alt
5
+ import yaml
6
+ from pathlib import Path
7
 
8
  # Call page config BEFORE importing modules that use Streamlit commands
9
  st.set_page_config(page_title="Reddit Sentiment Trends", layout="wide")
 
24
  """
25
  **Welcome!** This page shows how Reddit's AI communities feel day-to-day.
26
 
27
+ A daily pipeline grabs new posts and comments, scores their tone with a sentiment model, and saves the results to a public HuggingFace [dataset](https://huggingface.co/datasets/hblim/top_reddit_posts_daily). \n
 
 
28
 
 
29
  """
30
  )
31
 
 
38
  subreddits = df["subreddit"].unique()
39
  subreddit_colors = get_subreddit_colors(subreddits)
40
 
41
+ # Load mean/std parameters for sentiment spike bands per subreddit
42
+ params_path = Path(__file__).resolve().parent.parent / "spike_params.yaml"
43
+ try:
44
+ with params_path.open("r") as f:
45
+ spike_params = yaml.safe_load(f)
46
+ except FileNotFoundError:
47
+ spike_params = {}
48
+
49
  # Define time format to use across all charts
50
  time_format = "%m/%d/%Y"
51
 
 
54
  max_date = df["date"].max().date()
55
 
56
  # ── Community weighted sentiment line chart for all subreddits ───────────────
57
+ st.subheader("Daily Community-Weighted Sentiment")
58
+ st.markdown(
59
+ """
60
+ The line chart below plots the daily *community-weighted sentiment*, reflecting the average sentiment across all posts/comments in a subreddit community.
61
+
62
+ To calculate the community-weighted sentiment:
63
+ - First, each post or comment is assigned a sentiment score of −1 (negative) or +1 (positive)
64
+ - Then, the sentiment score is weighted by its upvotes so busier discussions matter more.
65
+ """
66
+ )
67
  # Add date range selector for the time series
68
  date_range = st.date_input(
69
  "Select date range for time series",
 
83
  )
84
  plot_df = filtered_df[filtered_df["subreddit"] == selected_subreddit]
85
 
86
+ # ── Determine shading band and dynamic y-axis domain ────────────────────────
87
+ mean_val = std_val = None
88
+ if selected_subreddit in spike_params:
89
+ mean_val = spike_params[selected_subreddit].get("mean")
90
+ std_val = spike_params[selected_subreddit].get("std")
91
+
92
+ # Calculate band limits if parameters exist
93
+ band_low = band_high = None
94
+ if mean_val is not None and std_val is not None:
95
+ band_low = mean_val - 3 * std_val
96
+ band_high = mean_val + 3 * std_val
97
+
98
+ # Determine y-axis domain based on data and (optional) band
99
+ sent_min = plot_df["community_weighted_sentiment"].min()
100
+ sent_max = plot_df["community_weighted_sentiment"].max()
101
+
102
+ if band_low is not None:
103
+ y_min = float(min(sent_min, band_low))
104
+ y_max = float(max(sent_max, band_high))
105
+ else:
106
+ y_min = float(sent_min)
107
+ y_max = float(sent_max)
108
+
109
+ # Add small padding so points are not flush with edges
110
+ padding = 0.05
111
+ y_domain = [y_min - padding, y_max + padding]
112
+
113
  # Define hover selection for nearest point
114
  nearest = alt.selection_single(
115
  name="nearest",
 
119
  empty="none"
120
  )
121
 
122
+ # Base chart with refreshed y-axis range
123
  base = alt.Chart(plot_df).encode(
124
+ x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format, labelPadding=15)),
125
+ y=alt.Y(
126
+ "community_weighted_sentiment:Q",
127
+ title="Community Weighted Sentiment",
128
+ scale=alt.Scale(domain=y_domain),
129
+ ),
130
  )
131
+ # Use a constant blue colour for all plot elements
132
+ line_colour = "#1f77b4"
 
133
 
134
  # Draw line for the selected subreddit
135
+ line = (
136
+ base.transform_calculate(legend='"daily community sentiment score"')
137
+ .mark_line(color=line_colour)
138
+ .encode(
139
+ color=alt.Color(
140
+ "legend:N",
141
+ scale=alt.Scale(domain=["daily community sentiment score", "historical 3σ sentiment range", "significant sentiment outlier"], range=[line_colour, line_colour, "red"]),
142
+ legend=None # hide default legend; we will add a custom manual legend below the chart
143
+ )
144
+ )
145
+ )
146
 
147
  # Invisible selectors to capture hover events
148
  selectors = base.mark_point(opacity=0).add_selection(nearest)
 
161
  ]
162
  ).transform_filter(nearest)
163
 
164
+ # Optional shaded band (mean ± 3σ)
165
+ band = None
166
+ outliers = None
167
+ domain_labels = [
168
+ "daily community sentiment score",
169
+ "historical 3σ sentiment range",
170
+ "significant sentiment outlier",
171
+ ]
172
+ domain_colors = [line_colour, line_colour, "red"]
173
+
174
+
175
+
176
+ if band_low is not None:
177
+ band_df = pd.DataFrame({
178
+ "date": [plot_df["date"].min(), plot_df["date"].max()],
179
+ "low": [band_low, band_low],
180
+ "high": [band_high, band_high],
181
+ })
182
+ band = (
183
+ alt.Chart(band_df)
184
+ .transform_calculate(legend='"historical 3σ sentiment range"')
185
+ .mark_area(opacity=0.15)
186
+ .encode(
187
+ x="date:T",
188
+ y=alt.Y("low:Q", scale=alt.Scale(domain=y_domain)),
189
+ y2="high:Q",
190
+ color=alt.Color(
191
+ "legend:N",
192
+ scale=alt.Scale(domain=domain_labels, range=domain_colors),
193
+ legend=None # suppress built-in legend for band
194
+ ),
195
+ )
196
+ )
197
+
198
+ # Identify significant outliers outside the band
199
+ outlier_df = plot_df[(plot_df["community_weighted_sentiment"] < band_low) |
200
+ (plot_df["community_weighted_sentiment"] > band_high)].copy()
201
+ if not outlier_df.empty:
202
+ outliers = (
203
+ alt.Chart(outlier_df)
204
+ .transform_calculate(legend='"significant sentiment outlier"')
205
+ .mark_point(shape="circle", size=100, fill="white", stroke="red", strokeWidth=2)
206
+ .encode(
207
+ x="date:T",
208
+ y="community_weighted_sentiment:Q",
209
+ color=alt.Color(
210
+ "legend:N",
211
+ scale=alt.Scale(domain=domain_labels, range=domain_colors),
212
+ legend=None # suppress built-in legend for outlier
213
+ ),
214
+ )
215
+ )
216
+
217
  # Layer everything and make interactive, with title showing subreddit
218
+ layers = [line, selectors, points_hover, tooltips]
219
+ if band is not None:
220
+ layers.insert(0, band) # draw band behind the line
221
+ if outliers is not None:
222
+ layers.append(outliers)
223
+
224
+ hover_chart = alt.layer(*layers).properties(
225
+ height=400, # increased height for more spacious plot area
226
+ ).interactive(bind_y=False)
227
+
228
+ # ── Manual legend (two rows) ───────────────────────────────────────────────
229
+ legend_df = pd.DataFrame({
230
+ "row": [0, 1],
231
+ "label": ["significant sentiment outlier", "historical 3σ sentiment range"],
232
+ "stroke": ["red", "lightblue"], # outline colour
233
+ "fill": ["white", "lightblue"], # interior fill (blue only for historical band)
234
+ "shape": ["circle", "square"],
235
+ })
236
+
237
+ legend_points = (
238
+ alt.Chart(legend_df)
239
+ .mark_point(size=100, filled=True)
240
+ .encode(
241
+ y=alt.Y("row:O", axis=None),
242
+ x=alt.value(0),
243
+ shape=alt.Shape("shape:N", legend=None),
244
+ stroke=alt.Stroke("stroke:N", scale=None, legend=None),
245
+ fill=alt.Fill("fill:N", scale=None, legend=None),
246
  )
247
+ )
248
+
249
+ legend_text = (
250
+ alt.Chart(legend_df)
251
+ .mark_text(align="left", baseline="middle", dx=15)
252
+ .encode(
253
+ y="row:O",
254
+ x=alt.value(0),
255
+ text="label:N",
256
+ )
257
+ )
258
+
259
+ manual_legend = (
260
+ legend_points + legend_text
261
+ ).properties(height=60)
262
+
263
+ # Concatenate chart and manual legend vertically
264
+ final_chart = alt.vconcat(
265
+ manual_legend,
266
+ hover_chart,
267
+ spacing=0
268
+ ).configure_view(strokeWidth=0)
269
 
270
+ st.altair_chart(final_chart, use_container_width=True)
271
 
272
  # ── Bar chart for post counts by subreddit (side-by-side) ────────────────────
273
  st.subheader("Daily Post Counts by Subreddit")
 
283
  legend=alt.Legend(title="Subreddit")
284
  ),
285
  tooltip=["date", "subreddit", "count"]
286
+ ).properties(height=400).interactive()
287
 
288
  st.altair_chart(bar_chart, use_container_width=True)
289
 
notebooks/spike_detection.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
spike_params.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # spike_params.yaml
2
+ LocalLLaMA:
3
+ mean: -0.4854
4
+ std: 0.0696
5
+ OpenAI:
6
+ mean: -0.4937
7
+ std: 0.0714
8
+ artificial:
9
+ mean: -0.4735
10
+ std: 0.1389
11
+ singularity:
12
+ mean: -0.4299
13
+ std: 0.0811
subreddit_daily_summary.csv CHANGED
@@ -211,3 +211,14 @@ date,subreddit,mean_sentiment,community_weighted_sentiment,count
211
  2025-06-22,OpenAI,-0.3846,-0.4019,130
212
  2025-06-22,artificial,-0.28,-0.4088,75
213
  2025-06-22,singularity,-0.28,-0.2504,125
 
 
 
 
 
 
 
 
 
 
 
 
211
  2025-06-22,OpenAI,-0.3846,-0.4019,130
212
  2025-06-22,artificial,-0.28,-0.4088,75
213
  2025-06-22,singularity,-0.28,-0.2504,125
214
+ 2025-06-23,OpenAI,-0.4497,-0.4646,149
215
+ 2025-06-23,artificial,-0.4947,-0.4533,95
216
+ 2025-06-23,singularity,-0.4581,-0.5307,155
217
+ 2025-06-24,LocalLLaMA,-0.2587,-0.2431,143
218
+ 2025-06-24,OpenAI,-0.566,-0.5149,212
219
+ 2025-06-24,artificial,-0.4783,-0.542,92
220
+ 2025-06-24,singularity,-0.4091,-0.4386,264
221
+ 2025-06-25,LocalLLaMA,-0.4489,-0.3935,421
222
+ 2025-06-25,OpenAI,-0.4486,-0.4325,185
223
+ 2025-06-25,artificial,-0.5814,-0.5666,86
224
+ 2025-06-25,singularity,-0.4744,-0.4534,312