Upload 9 files

- .gitattributes +1 -0
- Dockerfile +29 -0
- README.md +2 -11
- api/bus_cache.py +142 -0
- api/update_static.py +59 -0
- api/utils.py +30 -0
- requirements.txt +7 -0
- src/app.py +321 -0
- src/db_manager.py +145 -0
- src/ttc_gtfs.duckdb +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+src/ttc_gtfs.duckdb filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,29 @@
# Use a slim Python image for a smaller footprint
FROM python:3.11-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PORT=7860

# Create a non-root user (Hugging Face requirement)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Copy requirements first to leverage the Docker layer cache
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the code and the database
# Ensure ttc_gtfs.duckdb is in src/
COPY --chown=user . .

# Hugging Face Spaces expects port 7860
EXPOSE 7860

# Run uvicorn
CMD ["uvicorn", "src.app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,2 @@
----
-emoji: 📚
-colorFrom: green
-colorTo: pink
-sdk: docker
-pinned: false
-short_description: Backend for WheresMyBus2
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# WMB2Backend
+Updated backend for WheresMyBus2.0
api/bus_cache.py ADDED
@@ -0,0 +1,142 @@
import asyncio
import os
import time

import httpx  # type: ignore
from dotenv import load_dotenv  # type: ignore
from google.transit import gtfs_realtime_pb2  # type: ignore

load_dotenv()

TTC_VEHICLES_URL = os.getenv("GTFS_RT_URL")
TTC_TRIPS_URL = os.getenv("GTFS_DELAY_URL")
# Support both GTFS_ALERT_URL and GTFS_ALERTS_URL for compatibility
TTC_ALERTS_URL = os.getenv("GTFS_ALERTS_URL") or os.getenv("GTFS_ALERT_URL")

if not TTC_VEHICLES_URL:
    raise ValueError("GTFS_RT_URL is not set")
if not TTC_TRIPS_URL:
    raise ValueError("GTFS_DELAY_URL is not set")
if not TTC_ALERTS_URL:
    raise ValueError("GTFS_ALERTS_URL or GTFS_ALERT_URL is not set")

CAUSE_MAP = {1: "Weather", 2: "Holiday", 4: "Accident", 7: "Technical Problem", 11: "Police Activity"}
EFFECT_MAP = {1: "No Service", 3: "Significant Delays", 4: "Detour", 8: "Unknown Effect"}

class AsyncBusCache:
    def __init__(self, ttl=20):
        self.ttl = ttl
        self._data = None
        self._last_updated = 0

    async def get_data(self):
        if self._data and (time.time() - self._last_updated) < self.ttl:
            return self._data
        return await self._refresh()

    async def _refresh(self):
        try:
            async with httpx.AsyncClient() as client:
                # 1. Fetch all three feeds concurrently
                v_res, t_res, a_res = await asyncio.gather(
                    client.get(TTC_VEHICLES_URL, timeout=10),
                    client.get(TTC_TRIPS_URL, timeout=10),
                    client.get(TTC_ALERTS_URL, timeout=10),  # the alerts feed
                )

            # 2. Parse predictions (store ALL future stops)
            t_feed = gtfs_realtime_pb2.FeedMessage()
            t_feed.ParseFromString(t_res.content)

            # Map: { "trip_id": { "stop_id_1": time, "stop_id_2": time, ... } }
            prediction_map = {}
            for entity in t_feed.entity:
                if entity.HasField('trip_update'):
                    tu = entity.trip_update
                    trip_id = str(tu.trip.trip_id)

                    # Store every stop in the remainder of the trip
                    prediction_map[trip_id] = {
                        str(stu.stop_id): (stu.departure.time if stu.HasField('departure') else stu.arrival.time)
                        for stu in tu.stop_time_update
                    }

            # 3. Parse vehicle positions
            v_feed = gtfs_realtime_pb2.FeedMessage()
            v_feed.ParseFromString(v_res.content)

            processed_buses = []
            for entity in v_feed.entity:
                if entity.HasField('vehicle'):
                    v = entity.vehicle
                    t_id = str(v.trip.trip_id)

                    # All predictions for this trip
                    trip_predictions = prediction_map.get(t_id, {})

                    # Keep the first (earliest) stop separately for backward compatibility
                    next_stop_id = None
                    predicted_time = None
                    if trip_predictions:
                        sorted_stops = sorted(trip_predictions.items(), key=lambda x: x[1])
                        next_stop_id, predicted_time = sorted_stops[0]

                    processed_buses.append({
                        "id": v.vehicle.id,
                        "route": v.trip.route_id,
                        "trip_id": t_id,
                        "lat": round(v.position.latitude, 6),
                        "lon": round(v.position.longitude, 6),
                        "occupancy": v.occupancy_status,
                        "next_stop_id": next_stop_id,
                        "predicted_time": predicted_time,
                        "predictions": trip_predictions  # all remaining stops
                    })

            # 4. Parse alerts
            a_feed = gtfs_realtime_pb2.FeedMessage()
            a_feed.ParseFromString(a_res.content)

            # Map: { "route_id": [ {header, description, cause, effect, severity}, ... ] }
            route_alerts = {}
            for entity in a_feed.entity:
                if entity.HasField('alert'):
                    alert = entity.alert

                    # Extract English translations
                    header = next((t.text for t in alert.header_text.translation if t.language == "en"), "No Header")
                    description = next((t.text for t in alert.description_text.translation if t.language == "en"), "")

                    # Map cause and effect codes to human-readable strings
                    cause_code = int(alert.cause) if alert.HasField('cause') else None
                    effect_code = int(alert.effect) if alert.HasField('effect') else None

                    alert_payload = {
                        "header": header,
                        "description": description,
                        "cause": CAUSE_MAP.get(cause_code, "Unknown") if cause_code is not None else "Unknown",
                        "effect": EFFECT_MAP.get(effect_code, "Unknown") if effect_code is not None else "Unknown",
                        "severity": "HIGH" if effect_code == 1 else "MEDIUM"
                    }

                    # Link the alert to every route it mentions
                    for ie in alert.informed_entity:
                        if ie.HasField('route_id'):
                            rid = str(ie.route_id)
                            route_alerts.setdefault(rid, []).append(alert_payload)

            self._data = {
                "vehicles": processed_buses,
                "predictions": prediction_map,
                "alerts": route_alerts
            }
            self._last_updated = time.time()
            print(f"--- Cache Refreshed: {len(processed_buses)} buses, {len(route_alerts)} routes with alerts ---")
            return self._data

        except Exception as e:
            print(f"Async fetch failed: {e}")
            # Callers expect a dict; fall back to stale data or an empty structure
            return self._data if self._data else {"vehicles": [], "predictions": {}, "alerts": {}}
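How the cache is meant to be consumed — a minimal sketch, assuming GTFS_RT_URL, GTFS_DELAY_URL and GTFS_ALERTS_URL are set in .env and point at valid GTFS-Realtime protobuf feeds. Within the 20-second TTL, repeated calls return the same memoized dict without touching the network:

import asyncio
from api.bus_cache import AsyncBusCache

async def main():
    cache = AsyncBusCache(ttl=20)
    data = await cache.get_data()    # first call fetches and parses all three feeds
    again = await cache.get_data()   # inside the TTL window: served from memory
    assert data is again
    print(len(data["vehicles"]), "vehicles,", len(data["alerts"]), "routes with alerts")

asyncio.run(main())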
api/update_static.py ADDED
@@ -0,0 +1,59 @@
import io
import os
import shutil
import zipfile
from pathlib import Path

import requests  # type: ignore

# Toronto Open Data CKAN API constants
CKAN_BASE_URL = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
PACKAGE_ID = "merged-gtfs-ttc-routes-and-schedules"
STATIC_DIR = str(Path(__file__).parent.parent / "static")
DB_PATH = str(Path(__file__).parent.parent / "src" / "ttc_gtfs.duckdb")

def get_latest_gtfs_url():
    """Queries the CKAN API to find the current download URL for the GTFS ZIP."""
    params = {"id": PACKAGE_ID}
    response = requests.get(CKAN_BASE_URL, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Return the first resource whose format is ZIP
    for resource in data["result"]["resources"]:
        if resource["format"].lower() == "zip":
            return resource["url"]
    return None

def run_full_sync():
    download_url = get_latest_gtfs_url()
    if not download_url:
        print("Could not find GTFS ZIP via API.")
        return False

    print(f"Found latest GTFS at: {download_url}")

    # 1. Clear old files
    if os.path.exists(STATIC_DIR):
        print(f"Clearing existing static directory: {STATIC_DIR}")
        shutil.rmtree(STATIC_DIR)
    os.makedirs(STATIC_DIR)

    # 2. Download and extract
    print("Downloading and extracting...")
    r = requests.get(download_url, timeout=120)
    r.raise_for_status()
    print(f"Downloaded {len(r.content):,} bytes")
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        file_list = z.namelist()
        print(f"Extracting {len(file_list)} files to {STATIC_DIR}...")
        z.extractall(STATIC_DIR)
    print(f"✓ Extracted {len(os.listdir(STATIC_DIR))} files")

    # 3. Force a DB rebuild by deleting the old DuckDB file
    if os.path.exists(DB_PATH):
        print(f"Deleting old database: {DB_PATH}")
        os.remove(DB_PATH)

    print("✓ Sync complete. Database will rebuild on next API call.")
    return True

if __name__ == "__main__":
    run_full_sync()
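A sketch of the intended refresh cycle, run locally from the repo root (it assumes write access to static/ and src/, and that src/ is importable as a package): sync the static feed, then let init_db rebuild the DuckDB file from the fresh CSVs.

from api.update_static import run_full_sync
from src.db_manager import init_db

if run_full_sync():      # download + extract GTFS, delete the stale .duckdb
    con = init_db()      # re-imports routes, trips, stops and stop_times
    n = con.execute("SELECT COUNT(*) FROM stop_times").fetchone()[0]
    print(f"Rebuilt schedule with {n:,} stop_time rows")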
api/utils.py ADDED
@@ -0,0 +1,30 @@
from datetime import datetime, date, time, timedelta

def hms_to_seconds(hms_str):
    """Converts GTFS 'HH:MM:SS' (e.g. '25:30:00') to total seconds from midnight."""
    h, m, s = map(int, hms_str.split(':'))
    return (h * 3600) + (m * 60) + s

def get_service_day_start_ts():
    """
    Returns the Unix timestamp for 00:00:00 of the CURRENT service day.
    The TTC service day typically flips at 4:00 AM.
    """
    now = datetime.now()
    # Before 4 AM we are still technically on 'yesterday's' schedule
    if now.hour < 4:
        service_date = date.today() - timedelta(days=1)
    else:
        service_date = date.today()

    # Combine that date with 00:00:00 and return the timestamp
    service_start_dt = datetime.combine(service_date, time.min)
    return int(service_start_dt.timestamp())

def translate_occupancy(status):
    """Maps GTFS occupancy enums to human-readable strings."""
    mapping = {
        0: "Empty", 1: "Many Seats Available", 2: "Few Seats Available",
        3: "No Seats Available", 5: "Full", 6: "Not In Service"
    }
    return mapping.get(status, "Full")  # when in doubt, assume the bus is full
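These helpers carry the delay math used in src/app.py: GTFS-static times can run past 24:00:00, so a scheduled time is anchored to the service-day midnight rather than the calendar day. A worked example with an illustrative prediction:

from api.utils import hms_to_seconds, get_service_day_start_ts

# '25:30:00' means 1:30 AM on the morning after the service day began
assert hms_to_seconds("25:30:00") == 91800  # 25*3600 + 30*60

# Scheduled arrival as a Unix timestamp: service-day midnight + offset
plan_ts = get_service_day_start_ts() + hms_to_seconds("25:30:00")

# A (hypothetical) GTFS-RT prediction 300 seconds behind the plan
predicted_time = plan_ts + 300
assert round((predicted_time - plan_ts) / 60) == 5  # reported as 5 minutes late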
requirements.txt ADDED
@@ -0,0 +1,7 @@
fastapi
uvicorn
duckdb
httpx
python-dotenv
gtfs-realtime-bindings
protobuf
src/app.py ADDED
@@ -0,0 +1,321 @@
import sys
from datetime import datetime
from pathlib import Path

# Add the parent directory to the path to allow imports from api/
sys.path.insert(0, str(Path(__file__).parent.parent))
# Add the src directory to the path to allow imports from the same directory
sys.path.insert(0, str(Path(__file__).parent))

from api.bus_cache import AsyncBusCache  # type: ignore
from api.utils import hms_to_seconds, get_service_day_start_ts, translate_occupancy  # type: ignore
from db_manager import init_db  # type: ignore
from dotenv import load_dotenv  # type: ignore
from fastapi import FastAPI, HTTPException  # type: ignore
from fastapi.middleware.cors import CORSMiddleware  # type: ignore

load_dotenv()

ttc_cache = AsyncBusCache(ttl=20)

# Initialize the database connection globally
db = init_db()

app = FastAPI(title="WheresMyBus v2.0 API")

# Set up CORS for the React frontend
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # in production, restrict to the actual frontend URL
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/")
async def health_check():
    """Simple health check endpoint."""
    return "backend is running"

@app.get("/api/vehicles")
async def get_vehicles():
    data = await ttc_cache.get_data()
    vehicles = data.get("vehicles", [])
    return {
        "status": "success",
        "count": len(vehicles),
        "vehicles": vehicles
    }

@app.get("/api/routes")
async def get_all_routes():
    """Returns the complete list of TTC routes with their display names and colors."""
    try:
        # Missing colors fall back to defaults (TTC red on white)
        query = """
            SELECT
                route_id,
                route_short_name,
                route_long_name,
                COALESCE(route_color, 'FF0000') AS route_color,
                COALESCE(route_text_color, 'FFFFFF') AS route_text_color
            FROM routes
            ORDER BY
                CASE
                    WHEN CAST(route_short_name AS VARCHAR) ~ '^[0-9]+$' THEN CAST(route_short_name AS INTEGER)
                    ELSE 999
                END,
                route_short_name;
        """
        results = db.execute(query).fetchall()

        # Convert to a clean list of dictionaries
        route_list = [
            {
                "id": r[0],
                "number": r[1],
                "name": r[2],
                "color": f"#{r[3]}",
                "text_color": f"#{r[4]}"
            }
            for r in results
        ]

        return {
            "status": "success",
            "count": len(route_list),
            "routes": route_list
        }

    except Exception as e:
        return {"status": "error", "message": str(e)}

@app.get("/api/routes/{route_id}")
async def get_route_view(route_id: str):
    data = await ttc_cache.get_data()
    all_buses = data.get("vehicles", [])
    route_buses = [v for v in all_buses if v['route'] == route_id]

    if not route_buses:
        return {"route": route_id, "vehicles": []}

    # IMPORTANT: cast trip IDs to strings so they match the DB
    trip_ids = [str(v['trip_id']) for v in route_buses]
    placeholders = ','.join(['?'] * len(trip_ids))

    # CAST(... AS VARCHAR) forces DuckDB to compare strings to strings
    query = f"""
        SELECT
            CAST(st.trip_id AS VARCHAR),
            CAST(st.stop_id AS VARCHAR),
            st.arrival_time,
            t.trip_headsign
        FROM stop_times st
        JOIN trips t ON CAST(st.trip_id AS VARCHAR) = CAST(t.trip_id AS VARCHAR)
        WHERE CAST(st.trip_id AS VARCHAR) IN ({placeholders})
    """
    db_rows = db.execute(query, trip_ids).fetchall()

    # Check whether we got anything back from the DB
    if not db_rows:
        print(f"DEBUG: No matches in DB for Trip IDs: {trip_ids[:3]}")

    schedule_map = {(r[0], r[1]): r[2] for r in db_rows}
    name_map = {r[0]: r[3] for r in db_rows}

    service_day_ts = get_service_day_start_ts()
    enriched = []

    for bus in route_buses:
        # Default delay is 0 if no prediction exists
        raw_delay_mins = 0

        pred_time = bus.get('predicted_time')
        stop_id = bus.get('next_stop_id')

        if pred_time and stop_id:
            sched_hms = schedule_map.get((str(bus['trip_id']), str(stop_id)))
            if sched_hms:
                # Math: (reality Unix - plan Unix) / 60
                plan_ts = service_day_ts + hms_to_seconds(sched_hms)
                raw_delay_mins = round((pred_time - plan_ts) / 60)

        enriched.append({
            "number": bus['id'],
            "name": name_map.get(str(bus['trip_id']), "Not in Schedule"),  # the destination
            "location": {"lat": bus['lat'], "lon": bus['lon']},
            "delay_mins": raw_delay_mins,  # signed integer: 5 = 5m late, -2 = 2m early
            "fullness": translate_occupancy(bus['occupancy'])
        })

    return {
        "route": route_id,
        "count": len(enriched),
        "vehicles": enriched
    }

@app.get("/api/vehicles/{vehicle_id}")
async def get_vehicle_view(vehicle_id: str):
    # 1. Pull the latest data from the cache
    data = await ttc_cache.get_data()
    vehicles = data.get("vehicles", [])

    # 2. Find this specific bus in the list
    bus = next((v for v in vehicles if str(v['id']) == vehicle_id), None)

    if not bus:
        raise HTTPException(status_code=404, detail="Vehicle not active or not found")

    trip_id = str(bus['trip_id'])
    next_stop_id = bus.get('next_stop_id')
    predicted_time = bus.get('predicted_time')

    # 3. Handshake with the database (cast to VARCHAR to avoid type errors):
    #    fetch the destination name and the scheduled arrival at the next stop
    destination = "Not in Schedule"
    delay_mins = 0

    if next_stop_id:
        query = """
            SELECT
                t.trip_headsign,
                st.arrival_time
            FROM trips t
            JOIN stop_times st ON CAST(t.trip_id AS VARCHAR) = CAST(st.trip_id AS VARCHAR)
            WHERE CAST(t.trip_id AS VARCHAR) = ?
              AND CAST(st.stop_id AS VARCHAR) = ?
            LIMIT 1
        """
        row = db.execute(query, [trip_id, str(next_stop_id)]).fetchone()

        if row:
            destination = row[0]
            scheduled_hms = row[1]

            # Math: reality (Unix time) - plan (service-day start + scheduled seconds)
            if predicted_time:
                service_day_ts = get_service_day_start_ts()
                plan_ts = service_day_ts + hms_to_seconds(scheduled_hms)
                delay_mins = round((predicted_time - plan_ts) / 60)
    else:
        # If there is no next_stop_id, try to get the destination from trip_id alone
        query = """
            SELECT trip_headsign
            FROM trips
            WHERE CAST(trip_id AS VARCHAR) = ?
            LIMIT 1
        """
        row = db.execute(query, [trip_id]).fetchone()
        if row:
            destination = row[0]

    return {
        "vehicle_number": vehicle_id,
        "route_id": bus['route'],
        "name": destination,
        "location": {
            "lat": bus['lat'],
            "lon": bus['lon']
        },
        "delay_mins": delay_mins,
        "fullness": translate_occupancy(bus['occupancy']),
        "trip_id": trip_id
    }

@app.get("/api/stop/{stop_code}")
async def get_stop_view(stop_code: str):
    # 1. Translate the pole number to the database stop ID
    stop_info = db.execute(
        "SELECT stop_id, stop_name FROM stops WHERE CAST(stop_code AS VARCHAR) = ? LIMIT 1",
        [str(stop_code)]
    ).fetchone()
    if not stop_info:
        return {"error": "Stop code not found"}

    target_id = str(stop_info[0])
    stop_name = stop_info[1]

    # 2. Get the cache structure (dict with vehicles, predictions, alerts)
    cached_data = await ttc_cache.get_data()
    vehicles_list = cached_data.get("vehicles", [])
    predictions = cached_data.get("predictions", {})

    # Build a vehicles map for quick lookup
    vehicles = {str(v['trip_id']): v for v in vehicles_list}

    now = datetime.now().timestamp()
    two_hours_out = now + 7200
    arrivals = []

    # 3. Search the FULL itineraries for our target_id
    for trip_id, itinerary in predictions.items():
        if target_id in itinerary:
            pred_time = itinerary[target_id]

            # Only include buses that have not yet passed the stop and arrive within 2 hours
            if now <= pred_time <= two_hours_out:

                # 4. Handshake with the DB for destination and schedule
                query = """
                    SELECT t.trip_headsign, st.arrival_time, r.route_short_name
                    FROM trips t
                    JOIN stop_times st ON CAST(t.trip_id AS VARCHAR) = CAST(st.trip_id AS VARCHAR)
                    JOIN routes r ON t.route_id = r.route_id
                    WHERE CAST(t.trip_id AS VARCHAR) = ? AND CAST(st.stop_id AS VARCHAR) = ?
                    LIMIT 1
                """
                row = db.execute(query, [trip_id, target_id]).fetchone()

                if row:
                    # Find the actual bus for fullness (if it is on the road)
                    bus = vehicles.get(trip_id)

                    plan_ts = get_service_day_start_ts() + hms_to_seconds(row[1])

                    arrivals.append({
                        "route": row[2],
                        "destination": row[0],
                        "eta_mins": round((pred_time - now) / 60),
                        "delay_mins": round((pred_time - plan_ts) / 60),
                        "fullness": translate_occupancy(bus['occupancy']) if bus else "Unknown",
                        "vehicle_id": bus['id'] if bus else "In Transit"
                    })

    arrivals.sort(key=lambda x: x['eta_mins'])
    return {"stop_name": stop_name, "stop_code": stop_code, "arrivals": arrivals}

@app.get("/api/alerts")
async def get_all_alerts():
    """Returns every active service alert for the entire TTC network."""
    data = await ttc_cache.get_data()
    alerts = data.get("alerts", {})
    return {
        "timestamp": datetime.now().timestamp(),
        "count": len(alerts),
        "alerts": alerts
    }

@app.get("/api/alerts/{route_id}")
async def get_alerts_for_route(route_id: str):
    data = await ttc_cache.get_data()
    alerts = data.get("alerts", {})
    route_alerts = alerts.get(route_id, [])

    if not route_alerts:
        return {
            "route_id": route_id,
            "count": 0,
            "alerts": "No alerts"
        }

    return {
        "route_id": route_id,
        "count": len(route_alerts),
        "alerts": route_alerts
    }

if __name__ == "__main__":
    import uvicorn  # type: ignore
    # Start the server
    uvicorn.run(app, host="0.0.0.0", port=7860)
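A minimal client sketch against the running service, assuming a local deployment on port 7860; the route number and stop code below are illustrative, not taken from the feed:

import httpx

BASE = "http://localhost:7860"  # hypothetical local deployment

with httpx.Client(base_url=BASE, timeout=10) as client:
    routes = client.get("/api/routes").json()
    print(routes["count"], "routes")

    # Live vehicles on an illustrative route
    view = client.get("/api/routes/29").json()
    for bus in view["vehicles"]:
        print(bus["number"], bus["name"], f"{bus['delay_mins']}m", bus["fullness"])

    # Upcoming arrivals at an illustrative stop pole code
    stop = client.get("/api/stop/1234").json()
    for a in stop.get("arrivals", []):
        print(a["route"], "->", a["destination"], f"in {a['eta_mins']}m")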
src/db_manager.py ADDED
@@ -0,0 +1,145 @@
import os
import sys
from pathlib import Path

import duckdb  # type: ignore
from dotenv import load_dotenv  # type: ignore

# Add the parent directory to the path to allow imports from api/
sys.path.insert(0, str(Path(__file__).parent.parent))

from api.bus_cache import AsyncBusCache  # type: ignore

# Configuration - always save the DB in the src/ directory
DB_PATH = str(Path(__file__).parent / "ttc_gtfs.duckdb")
STATIC_DIR = str(Path(__file__).parent.parent / "static")

def init_db():
    """Connects to DuckDB and imports the GTFS-static data from the static/ directory."""
    # 1. Connect to DuckDB (creates the file if it doesn't exist)
    con = duckdb.connect(DB_PATH)

    # 2. Check whether the database is already populated
    tables = con.execute("SHOW TABLES").fetchall()
    if ('stop_times',) in tables:
        print("--- Database already exists and is populated ---")
        return con

    print("--- Initializing DuckDB: Importing CSVs from /static ---")

    # Core GTFS files needed for the rework
    files = ["routes.txt", "trips.txt", "stops.txt", "stop_times.txt"]

    for f in files:
        file_path = Path(STATIC_DIR) / f
        table_name = f.replace(".txt", "")

        if file_path.exists():
            print(f"Loading {f} into table '{table_name}'...")
            # read_csv_auto detects headers and data types automatically;
            # use an absolute path for DuckDB
            abs_file_path = str(file_path.resolve())
            con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM read_csv_auto('{abs_file_path}')")
        else:
            print(f"Error: {file_path} not found! Please ensure it is in the static/ folder.")

    print("--- Database Import Complete ---")
    return con

async def test_data_integrity(con):
    """
    Runs a test join to confirm that a trip ID can be linked to a route name and stop list.
    Uses AsyncBusCache to get a real trip_id from the live API.
    """
    print("--- Running Integrity Test ---")
    try:
        # Get a trip_id from the live API via AsyncBusCache
        cache = AsyncBusCache(ttl=20)
        data = await cache.get_data()
        live_vehicles = data.get("vehicles", []) if isinstance(data, dict) else []

        if not live_vehicles:
            print("No vehicles available from API, falling back to database trip_id")
            sample_trip = con.execute("SELECT trip_id FROM trips LIMIT 1").fetchone()[0]
        else:
            # Fetch the raw GTFS feed to pull a trip_id straight from the source
            import httpx  # type: ignore
            from google.transit import gtfs_realtime_pb2  # type: ignore

            load_dotenv()
            gtfs_rt_url = os.getenv("GTFS_RT_URL")
            if not gtfs_rt_url:
                raise ValueError("GTFS_RT_URL is not set")

            async with httpx.AsyncClient() as client:
                response = await client.get(gtfs_rt_url, timeout=10)
                response.raise_for_status()

            feed = gtfs_realtime_pb2.FeedMessage()
            feed.ParseFromString(response.content)

            # Take the trip_id from the first vehicle entity
            sample_trip = None
            for entity in feed.entity:
                if entity.HasField('vehicle') and entity.vehicle.trip.trip_id:
                    sample_trip = entity.vehicle.trip.trip_id
                    break

            if not sample_trip:
                print("No trip_id found in API response, falling back to database")
                sample_trip = con.execute("SELECT trip_id FROM trips LIMIT 1").fetchone()[0]
            else:
                print(f"Using trip_id from live API: {sample_trip}")

        # First, get the total stop count for this trip (parameterized to avoid injection)
        count_query = """
            SELECT COUNT(*)
            FROM trips t
            JOIN stop_times st ON t.trip_id = st.trip_id
            WHERE t.trip_id = ?
        """
        total_count = con.execute(count_query, [sample_trip]).fetchone()[0]

        # Show all stops if <= 20, otherwise show the first 20
        sample_size = min(20, total_count)

        query = f"""
            SELECT
                r.route_short_name,
                t.trip_headsign,
                st.stop_sequence,
                s.stop_name
            FROM trips t
            JOIN routes r ON t.route_id = r.route_id
            JOIN stop_times st ON t.trip_id = st.trip_id
            JOIN stops s ON st.stop_id = s.stop_id
            WHERE t.trip_id = ?
            ORDER BY st.stop_sequence
            LIMIT {sample_size};
        """
        results = con.execute(query, [sample_trip]).fetchall()

        print(f"\nSuccessfully joined data for Trip ID: {sample_trip}")
        print(f"Total stops in trip: {total_count}")
        if total_count > sample_size:
            print(f"Showing first {sample_size} stops (sample):\n")
        else:
            print(f"Showing all {total_count} stops:\n")

        print(f"{'Route':<8} {'Headsign':<30} {'Stop #':<8} {'Stop Name':<50}")
        print("-" * 100)
        for res in results:
            route = res[0] or "N/A"
            headsign = (res[1] or "N/A")[:28]  # truncate if too long
            stop_seq = res[2]
            stop_name = (res[3] or "N/A")[:48]  # truncate if too long
            print(f"{route:<8} {headsign:<30} {stop_seq:<8} {stop_name:<50}")

    except Exception as e:
        print(f"Integrity test failed: {e}")

if __name__ == "__main__":
    import asyncio
    db_con = init_db()
    asyncio.run(test_data_integrity(db_con))
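For ad-hoc inspection of the committed database, a sketch that opens src/ttc_gtfs.duckdb read-only from the repo root (the sample query is illustrative):

import duckdb

# Open the committed GTFS database without taking a write lock
con = duckdb.connect("src/ttc_gtfs.duckdb", read_only=True)

# The four tables created by init_db()
print(con.execute("SHOW TABLES").fetchall())

# e.g. the five routes with the longest display names
for short, long_name in con.execute("""
    SELECT route_short_name, route_long_name
    FROM routes
    ORDER BY length(route_long_name) DESC
    LIMIT 5
""").fetchall():
    print(short, "-", long_name)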
src/ttc_gtfs.duckdb ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5eec5ac01f4d0f0dcd888097054dc7d4e4b849ecbe68012c0c8eb54a4e941d8f
size 53751808