Skip to content

Commit 065a3a3

Browse files
authored
Add files via upload
Added tutorial on "How to Build a Production-Ready Multi-Agent Incident Response System Using OpenAI Swarm and Tool-Augmented Agents"
1 parent 1085876 commit 065a3a3

File tree

1 file changed

+336
-0
lines changed

1 file changed

+336
-0
lines changed
Lines changed: 336 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"provenance": []
7+
},
8+
"kernelspec": {
9+
"name": "python3",
10+
"display_name": "Python 3"
11+
},
12+
"language_info": {
13+
"name": "python"
14+
}
15+
},
16+
"cells": [
17+
{
18+
"cell_type": "code",
19+
"source": [
20+
"!pip -q install -U openai\n",
21+
"!pip -q install -U \"git+https://github.com/openai/swarm.git\"\n",
22+
"\n",
23+
"import os\n",
24+
"\n",
25+
"def load_openai_key():\n",
26+
"    try:\n",
27+
"        from google.colab import userdata\n",
28+
"        key = userdata.get(\"OPENAI_API_KEY\")\n",
29+
"    except Exception:\n",
30+
"        key = None\n",
31+
"    if not key:\n",
32+
"        import getpass\n",
33+
"        key = getpass.getpass(\"Enter OPENAI_API_KEY (hidden): \").strip()\n",
34+
"    if not key:\n",
35+
"        raise RuntimeError(\"OPENAI_API_KEY not provided\")\n",
36+
"    return key\n",
37+
"\n",
38+
"os.environ[\"OPENAI_API_KEY\"] = load_openai_key()"
39+
],
40+
"metadata": {
41+
"id": "F2XYMogF-9ir"
42+
},
43+
"execution_count": null,
44+
"outputs": []
45+
},
46+
{
47+
"cell_type": "code",
48+
"source": [
49+
"import json\n",
50+
"import re\n",
51+
"from typing import List, Dict\n",
52+
"from swarm import Swarm, Agent\n",
53+
"\n",
54+
"client = Swarm()"
55+
],
56+
"metadata": {
57+
"id": "H_Q2p-1f-9a9"
58+
},
59+
"execution_count": null,
60+
"outputs": []
61+
},
62+
{
63+
"cell_type": "code",
64+
"source": [
65+
"KB_DOCS = [\n",
66+
" {\n",
67+
" \"id\": \"kb-incident-001\",\n",
68+
" \"title\": \"API Latency Incident Playbook\",\n",
69+
" \"text\": \"If p95 latency spikes, validate deploys, dependencies, and error rates. Rollback, cache, rate-limit, scale. Compare p50 vs p99 and inspect upstream timeouts.\"\n",
70+
" },\n",
71+
" {\n",
72+
" \"id\": \"kb-risk-001\",\n",
73+
" \"title\": \"Risk Communication Guidelines\",\n",
74+
" \"text\": \"Updates must include impact, scope, mitigation, owner, and next update. Avoid blame and separate internal vs external messaging.\"\n",
75+
" },\n",
76+
" {\n",
77+
" \"id\": \"kb-ops-001\",\n",
78+
" \"title\": \"On-call Handoff Template\",\n",
79+
" \"text\": \"Include summary, timeline, current status, mitigations, open questions, next actions, and owners.\"\n",
80+
" },\n",
81+
"]\n",
82+
"\n",
83+
"def _normalize(s: str) -> List[str]:\n",
84+
" return re.sub(r\"[^a-z0-9\\s]\", \" \", s.lower()).split()\n",
85+
"\n",
86+
"def search_kb(query: str, top_k: int = 3) -> str:\n",
87+
"    q = set(_normalize(query))\n",
88+
"    scored = []\n",
89+
"    for d in KB_DOCS:\n",
90+
"        score = len(q.intersection(set(_normalize(d[\"title\"] + \" \" + d[\"text\"]))))\n",
91+
"        scored.append((score, d))\n",
92+
"    scored.sort(key=lambda x: x[0], reverse=True)\n",
93+
"    docs = [d for s, d in scored[:top_k] if s > 0] or [scored[0][1]]\n",
94+
"    return json.dumps(docs, indent=2)"
95+
],
96+
"metadata": {
97+
"id": "QM4M4Prs-9X_"
98+
},
99+
"execution_count": null,
100+
"outputs": []
101+
},
102+
{
103+
"cell_type": "code",
104+
"source": [
105+
"def estimate_mitigation_impact(options_json: str) -> str:\n",
106+
"    try:\n",
107+
"        options = json.loads(options_json)\n",
108+
"    except Exception as e:\n",
109+
"        return json.dumps({\"error\": str(e)})\n",
110+
"    ranking = []\n",
111+
"    for o in options:\n",
112+
"        conf = float(o.get(\"confidence\", 0.5))\n",
113+
"        risk = o.get(\"risk\", \"medium\")\n",
114+
"        penalty = {\"low\": 0.1, \"medium\": 0.25, \"high\": 0.45}.get(risk, 0.25)\n",
115+
"        ranking.append({\n",
116+
"            \"option\": o.get(\"option\"),\n",
117+
"            \"confidence\": conf,\n",
118+
"            \"risk\": risk,\n",
119+
"            \"score\": round(conf - penalty, 3)\n",
120+
"        })\n",
121+
"    ranking.sort(key=lambda x: x[\"score\"], reverse=True)\n",
122+
"    return json.dumps(ranking, indent=2)"
123+
],
124+
"metadata": {
125+
"id": "dkKtu2_c-9VY"
126+
},
127+
"execution_count": null,
128+
"outputs": []
129+
},
130+
{
131+
"cell_type": "code",
132+
"source": [
133+
"def handoff_to_sre():\n",
134+
" return sre_agent\n",
135+
"\n",
136+
"def handoff_to_comms():\n",
137+
" return comms_agent\n",
138+
"\n",
139+
"def handoff_to_handoff_writer():\n",
140+
" return handoff_writer_agent\n",
141+
"\n",
142+
"def handoff_to_critic():\n",
143+
" return critic_agent"
144+
],
145+
"metadata": {
146+
"id": "F9g89NOe-9Sm"
147+
},
148+
"execution_count": null,
149+
"outputs": []
150+
},
151+
{
152+
"cell_type": "code",
153+
"source": [
154+
"triage_agent = Agent(\n",
155+
" name=\"Triage\",\n",
156+
" model=\"gpt-4o-mini\",\n",
157+
" instructions=\"\"\"\n",
158+
"Decide which agent should handle the request.\n",
159+
"Use SRE for incident response.\n",
160+
"Use Comms for customer or executive messaging.\n",
161+
"Use HandoffWriter for on-call notes.\n",
162+
"Use Critic for review or improvement.\n",
163+
"\"\"\",\n",
164+
" functions=[search_kb, handoff_to_sre, handoff_to_comms, handoff_to_handoff_writer, handoff_to_critic]\n",
165+
")\n",
166+
"\n",
167+
"sre_agent = Agent(\n",
168+
" name=\"SRE\",\n",
169+
" model=\"gpt-4o-mini\",\n",
170+
" instructions=\"\"\"\n",
171+
"Produce a structured incident response with triage steps,\n",
172+
"ranked mitigations, ranked hypotheses, and a 30-minute plan.\n",
173+
"\"\"\",\n",
174+
" functions=[search_kb, estimate_mitigation_impact]\n",
175+
")\n",
176+
"\n",
177+
"comms_agent = Agent(\n",
178+
" name=\"Comms\",\n",
179+
" model=\"gpt-4o-mini\",\n",
180+
" instructions=\"\"\"\n",
181+
"Produce an external customer update and an internal technical update.\n",
182+
"\"\"\",\n",
183+
" functions=[search_kb]\n",
184+
")\n",
185+
"\n",
186+
"handoff_writer_agent = Agent(\n",
187+
" name=\"HandoffWriter\",\n",
188+
" model=\"gpt-4o-mini\",\n",
189+
" instructions=\"\"\"\n",
190+
"Produce a clean on-call handoff document with standard headings.\n",
191+
"\"\"\",\n",
192+
" functions=[search_kb]\n",
193+
")\n",
194+
"\n",
195+
"critic_agent = Agent(\n",
196+
" name=\"Critic\",\n",
197+
" model=\"gpt-4o-mini\",\n",
198+
" instructions=\"\"\"\n",
199+
"Critique the previous answer, then produce a refined final version and a checklist.\n",
200+
"\"\"\"\n",
201+
")"
202+
],
203+
"metadata": {
204+
"id": "KPCgM0Mt_6Xf"
205+
},
206+
"execution_count": null,
207+
"outputs": []
208+
},
209+
{
210+
"cell_type": "code",
211+
"execution_count": 2,
212+
"metadata": {
213+
"colab": {
214+
"base_uri": "https://localhost:8080/"
215+
},
216+
"id": "_3r0_ZQm8Kv4",
217+
"outputId": "9245d520-d115-406a-ff77-1729db9d295e"
218+
},
219+
"outputs": [
220+
{
221+
"output_type": "stream",
222+
"name": "stdout",
223+
"text": [
224+
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
225+
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
226+
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
227+
"Enter OPENAI_API_KEY (hidden): ··········\n",
228+
"\n",
229+
"==========================================================================================\n",
230+
"FINAL AGENT: Critic\n",
231+
"==========================================================================================\n",
232+
"### 1) 5-Point Critique:\n",
233+
"\n",
234+
"1. **Clarity and Brevity:** The updates should maintain a clear separation between internal and external communications. The current phrasing can be streamlined, especially the internal update which is verbose.\n",
235+
"\n",
236+
"2. **Action Items Specificity:** The actions proposed in the internal update are broad. Specific actions should be clearly defined, such as who exactly will execute them and deadlines for when they should be completed.\n",
237+
"\n",
238+
"3. **Customer Context:** The external update lacks context for customers about why this issue may impact them or what they can expect while it’s resolved. Including a reassurance would improve customer confidence.\n",
239+
"\n",
240+
"4. **Next Steps Measurement:** Mentioning specific metrics or expected outcomes for the next update would provide both internal and external audiences with clear expectations for resolution.\n",
241+
"\n",
242+
"5. **Technical Jargon:** While the internal audience may understand technical jargon, it would be beneficial to define or simplify some terminology (like \"p95\" or \"upstream timeouts\") so that all stakeholders fully grasp the situation.\n",
243+
"\n",
244+
"### 2) Revised Best Answer:\n",
245+
"\n",
246+
"#### **External Update (Customer-Friendly)**\n",
247+
"\n",
248+
"**Impact:** We are currently facing increased latency in our service, escalating from 250 milliseconds to 2.5 seconds for some users.\n",
249+
"\n",
250+
"**Scope:** This latency issue affects a subset of our users. Our team is actively investigating and addressing the situation.\n",
251+
"\n",
252+
"**Mitigation:** We are exploring multiple solutions:\n",
253+
"- Rolling back the most recent deployment.\n",
254+
"- Enabling caching to improve response times.\n",
255+
"- Optimizing our services to handle demand better.\n",
256+
"\n",
257+
"**ETA/Next Update:** We will provide an update in 30 minutes as we continue our investigation and implement these solutions.\n",
258+
"\n",
259+
"---\n",
260+
"\n",
261+
"#### **Internal Update (Technical)**\n",
262+
"\n",
263+
"**Current Situation:** After the latest deployment, we notice p95 response time has increased significantly from 250ms to 2.5s. Error rates are slightly elevated, with increased upstream timeouts but stable database CPU usage.\n",
264+
"\n",
265+
"**Immediate Actions:**\n",
266+
"1. Roll back the latest deployment by [specific time].\n",
267+
"2. Enable caching for all affected endpoints by [specific time].\n",
268+
"3. Rate-limit requests to at-risk services by [specific time].\n",
269+
"4. Analyze latency metrics (comparing p50 vs p99) to identify specific bottlenecks, focusing on slow DB queries and troublesome upstream services.\n",
270+
"\n",
271+
"**Who's Responsible:**\n",
272+
"- **Incident Manager:** [Name]\n",
273+
"- **Engineering Lead:** [Name]\n",
274+
"- **Database Administrator:** [Name]\n",
275+
"\n",
276+
"**Next Steps:** Each responsible party is to report their progress and findings every 30 minutes until the situation stabilizes, or the issue is resolved.\n",
277+
"\n",
278+
"### 3) Reusable Checklist:\n",
279+
"\n",
280+
"**For External Updates:**\n",
281+
"- [ ] Clearly state the impact of the issue.\n",
282+
"- [ ] Specify the scope of affected users.\n",
283+
"- [ ] Describe ongoing mitigations clearly.\n",
284+
"- [ ] Provide a timeline for the next update.\n",
285+
"- [ ] Utilize customer-friendly language.\n",
286+
"\n",
287+
"**For Internal Updates:**\n",
288+
"- [ ] Summarize the current situation with technical metrics.\n",
289+
"- [ ] Specify immediate actions and clear deadlines.\n",
290+
"- [ ] Assign responsibilities to clear owners.\n",
291+
"- [ ] Set expectations for future updates.\n",
292+
"- [ ] Use straightforward language while maintaining necessary technical detail.\n",
293+
"\n",
294+
"\n",
295+
"--- TRACE (last ~8 messages) ---\n",
296+
"\n",
297+
"[Critic]\n",
298+
"### 1) 5-Point Critique:\n",
299+
"\n",
300+
"1. **Clarity and Brevity:** The updates should maintain a clear separation between internal and external communications. The current phrasing can be streamlined, especially the internal update which is verbose.\n",
301+
"\n",
302+
"2. **Action Items Specificity:** The actions proposed in the internal update are broad. Specific actions should be clearly defined, such as who exactly will execute them and deadlines for when they should be completed.\n",
303+
"\n",
304+
"3. **Customer Context:** The external update lacks context for customers about why this issue may impact them or what they can expect while it’s resolved. Including a reassurance would improve customer confidence.\n",
305+
"\n",
306+
"4. **Next Steps Measurement:** Mentioning specific metrics or expected outcomes for the next update would provide both internal and external audiences with clear expectations for resolution.\n",
307+
"\n",
308+
"5. **Technical Jargon:** While the internal audience may understand technical jargon, it would be beneficial to define or simplify some terminology (like \"p95\" or \"upstream timeouts\") so that all stakeholders fully grasp the situation.\n",
309+
"\n",
310+
"### 2) Revised Best Answer:\n",
311+
"\n",
312+
"#### **External Update (Customer-Friendly)**\n",
313+
"\n",
314+
"**Impact:** We are cur\n"
315+
]
316+
}
317+
],
318+
"source": [
319+
"def run_pipeline(user_request: str):\n",
320+
" messages = [{\"role\": \"user\", \"content\": user_request}]\n",
321+
" r1 = client.run(agent=triage_agent, messages=messages, max_turns=8)\n",
322+
" messages2 = r1.messages + [{\"role\": \"user\", \"content\": \"Review and improve the last answer\"}]\n",
323+
" r2 = client.run(agent=critic_agent, messages=messages2, max_turns=4)\n",
324+
" return r2.messages[-1][\"content\"]\n",
325+
"\n",
326+
"request = \"\"\"\n",
327+
"Production p95 latency jumped from 250ms to 2.5s after a deploy.\n",
328+
"Errors slightly increased, DB CPU stable, upstream timeouts rising.\n",
329+
"Provide a 30-minute action plan and a customer update.\n",
330+
"\"\"\"\n",
331+
"\n",
332+
"print(run_pipeline(request))"
333+
]
334+
}
335+
]
336+
}

0 commit comments

Comments
 (0)