Skip to content

Commit 6afb1e1

Browse files
committed
feat: add Classifiers notebook and Streamlit annotation tool for query labeling
1 parent 1aecb72 commit 6afb1e1

File tree

4 files changed

+1580
-0
lines changed

4 files changed

+1580
-0
lines changed

3. Classifiers.ipynb

Lines changed: 1395 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626

2727
**2. Better Summaries**: Learn how to create domain-specific, concise summaries for Weights & Biases queries to produce more meaningful and actionable topic clusters.
2828

29+
**3. Classifiers**: Learn how to build classifiers that detect the topics you've identified and monitor them in production.
30+
2931
## About
3032

3133
This repository was created for the **AI Engineering Summit**. It demonstrates practical techniques for analyzing and improving Retrieval-Augmented Generation (RAG) systems using real-world data and modern topic modeling tools.

app.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
import streamlit as st
2+
import json
3+
import os
4+
from streamlit_shortcuts import button
5+
from pathlib import Path
6+
7+
# Create the ./data directory on first run; a no-op when it already exists.
data_dir = Path("./data")
data_dir.mkdir(exist_ok=True)

# Page-level configuration first, then the visible heading.
st.set_page_config(page_title="Simple Annotation Tool", layout="wide")
st.title("Simple Annotation Tool")
13+
14+
15+
# Load conversations
16+
@st.cache_data
def load_conversations():
    """Load the conversations to annotate from ``./data/conversations.json``.

    The function is wrapped in ``st.cache_data``.

    Returns:
        The parsed JSON content of the file, or an empty list when the file
        is missing or cannot be decoded/parsed as JSON. In both failure
        cases an error message is displayed in the app.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default locale encoding when reading it.
        with open("./data/conversations.json", "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        st.error("conversations.json file not found in ./data directory!")
        return []
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # Report a corrupt file the same way as a missing one instead of
        # letting the exception propagate out of the loader.
        st.error(f"conversations.json could not be parsed as JSON: {exc}")
        return []
24+
25+
26+
# Initialize or load labels
27+
def load_labels():
    """Load previously saved labels from ``./data/labels.jsonl``.

    Each non-blank line is expected to hold a JSON object with at least the
    ``query_id`` and ``label`` keys. When a ``query_id`` occurs on several
    lines, the last line wins.

    Lines that are not valid JSON, or that lack the required keys (for
    example a final line truncated by an interrupted write), are skipped
    with a logged warning instead of aborting the whole load.

    Returns:
        dict: Mapping of ``query_id`` to its most recent ``label``; empty
        when the labels file does not exist.
    """
    import logging

    labels_file = "./data/labels.jsonl"
    labels = {}
    if not os.path.exists(labels_file):
        return labels

    with open(labels_file, mode="r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                item = json.loads(line)
                labels[item["query_id"]] = item["label"]
            except (json.JSONDecodeError, KeyError, TypeError) as exc:
                # TypeError covers records that are not JSON objects or
                # whose query_id is not usable as a dictionary key.
                logging.getLogger(__name__).warning(
                    "Skipping malformed line %d in %s: %r",
                    line_number,
                    labels_file,
                    exc,
                )
    return labels
37+
38+
39+
# Save label
40+
def save_label(conversation, label_value):
    """Append one annotation to the label log and request a move forward.

    Args:
        conversation: Mapping providing the ``query_id``, ``query`` and
            ``matching_document`` keys.
        label_value: Label to record for this conversation.
    """
    record = {
        field: conversation[field]
        for field in ("query_id", "query", "matching_document")
    }
    record["label"] = label_value

    # One JSON object per line, appended to the existing file.
    with open("./data/labels.jsonl", mode="a") as out:
        out.write(json.dumps(record) + "\n")

    # Mirror the label in session state and flag that the app should advance.
    state = st.session_state
    state.labels[record["query_id"]] = label_value
    state.next_item = True
56+
57+
58+
# Label functions
59+
def label_artifact():
    """Save the label ``"artifact"`` for the conversation at the current index.

    Does nothing when the current index is not a valid position in the list.
    """
    state = st.session_state
    position = state.current_index
    items = state.conversations
    if position >= len(items):
        return
    save_label(items[position], "artifact")
64+
65+
66+
def label_no_artifact():
67+
if st.session_state.current_index < len(st.session_state.conversations):
68+
save_label(
69+
st.session_state.conversations[st.session_state.current_index],
70+
"not_artifact",
71+
)
72+
73+
74+
# Navigation functions
75+
def prev_item():
    """Move the current index back by one unless it is already at the start."""
    state = st.session_state
    if state.current_index <= 0:
        return
    state.current_index -= 1
78+
79+
80+
def next_item():
81+
if st.session_state.current_index < len(st.session_state.conversations) - 1:
82+
st.session_state.current_index += 1
83+
84+
85+
# First run of a session: seed the index, the data and the advance flag.
if "current_index" not in st.session_state:
    st.session_state["current_index"] = 0
    st.session_state["conversations"] = load_conversations()
    st.session_state["labels"] = load_labels()
    st.session_state["next_item"] = False

# Counter kept in session state for streamlit_shortcuts buttons.
if "button_key_counter" not in st.session_state:
    st.session_state["button_key_counter"] = 0

# A label saved during the previous interaction asks for a move to the next
# item; apply it now (staying put on the last item) and clear the flag.
if st.session_state["next_item"]:
    last_position = len(st.session_state["conversations"]) - 1
    if st.session_state["current_index"] < last_position:
        st.session_state["current_index"] += 1
    st.session_state["next_item"] = False
101+
102+
# Progress info in sidebar
103+
with st.sidebar:
104+
st.header("Annotation Progress")
105+
total = len(st.session_state.conversations)
106+
labeled = len(st.session_state.labels)
107+
remaining = total - labeled
108+
109+
st.write(f"**Labeled:** {labeled}/{total}")
110+
st.write(f"**Remaining:** {remaining}")
111+
st.progress(labeled / total if total > 0 else 0)
112+
113+
st.subheader("Keyboard Shortcuts")
114+
st.write("**Ctrl+E:** Artifact")
115+
st.write("**Ctrl+R:** No Artifact")
116+
st.write("**Ctrl+P:** Previous")
117+
st.write("**Ctrl+N:** Next")
118+
119+
# Main display area: status line, query/document panes and the action buttons.
if st.session_state.conversations:
    if st.session_state.current_index < len(st.session_state.conversations):
        conversation = st.session_state.conversations[st.session_state.current_index]

        # Position indicator on the left, existing label (if any) on the right.
        status_row = st.columns([3, 1])
        with status_row[0]:
            st.text(
                f"Item {st.session_state.current_index + 1} of {len(st.session_state.conversations)}"
            )
        with status_row[1]:
            already_labeled = conversation["query_id"] in st.session_state.labels
            if already_labeled:
                label = st.session_state.labels[conversation["query_id"]]
                st.text(f"[LABELED: {label}]")

        # Keyboard shortcut reminder above the content.
        st.markdown("**Ctrl+E**: Mark as Artifact | **Ctrl+R**: Mark as No-Artifact")
        st.markdown("---")

        # Query and matching document side by side, both read-only.
        cols = st.columns(2)

        with cols[0]:
            st.text("QUERY:")
            # Widgets get a real label, hidden via label_visibility, instead
            # of an empty string.
            st.text_area(
                "Query",
                conversation["query"],
                height=100,
                disabled=True,
                key="query_field",
                label_visibility="collapsed",
            )

        with cols[1]:
            st.text("DOCUMENT:")
            st.text_area(
                "Document",
                conversation["matching_document"],
                height=300,
                disabled=True,
                key="doc_field",
                label_visibility="collapsed",
            )

        # Action buttons. Each callback is registered as the button's
        # on_click handler and therefore already runs once per click. It must
        # NOT be called again when the button returns True: by then the rerun
        # has advanced current_index, so a second call would label the next
        # item as well.
        button_cols = st.columns([1, 1, 1, 1, 2])
        with button_cols[0]:
            button("⭐️ Artifact", "ctrl+e", label_artifact, hint=True)
        with button_cols[1]:
            button("❌ No-Artifact", "ctrl+r", label_no_artifact, hint=True)
        # Navigation buttons backing the Ctrl+P / Ctrl+N shortcuts that the
        # sidebar advertises.
        with button_cols[2]:
            button("Previous", "ctrl+p", prev_item, hint=True)
        with button_cols[3]:
            button("Next", "ctrl+n", next_item, hint=True)
    else:
        st.success("All items have been processed!")
else:
    st.error(
        "No data available. Please ensure your conversations.json file is in the ./data directory."
    )

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,12 @@ description = "Add your description here"
55
readme = "README.md"
66
requires-python = ">=3.9"
77
dependencies = [
8+
"instructor-classify",
89
"ipykernel>=6.29.5",
910
"ipython>=8.18.1",
1011
"kura>=0.4.4",
12+
"streamlit-shortcuts>=0.2.0",
1113
]
14+
15+
[tool.uv.sources]
16+
instructor-classify = { git = "https://github.com/jxnl/instructor-classify" }

0 commit comments

Comments
 (0)