You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
"message": "Input not allowed. The input was blocked by the 'self check input' flow."
770
+
}
771
+
}
772
+
```
773
+
709
774
## Knowledge base Documents
710
775
711
776
By default, an `LLMRails` instance supports using a set of documents as context for generating the bot responses. To include documents as part of your knowledge base, you must place them in the `kb` folder inside your config folder:
Copy file name to clipboard. Expand all lines: examples/configs/guardrails_only/output/config.co
+9-3Lines changed: 9 additions & 3 deletions
Original file line number
Diff line number
Diff line change
@@ -12,11 +12,17 @@ define bot deny
12
12
"DENY"
13
13
14
14
define subflow dummy output rail
15
-
"""A dummy input rail which checks if the word "dummy" is included in the text."""
15
+
"""A dummy output rail which checks if the word "dummy" is included in the text."""
16
16
if "dummy" in $bot_message
17
-
bot deny
17
+
if $config.enable_rails_exceptions
18
+
create event DummyOutputRailException(message="Dummy output detected. The bot's message contains the word 'dummy'.")
19
+
else
20
+
bot deny
18
21
stop
19
22
20
23
define subflow allow output
21
-
bot allow
24
+
if $config.enable_rails_exceptions
25
+
create event AllowOutputRailException(message="Allow output triggered. The bot will respond with 'ALLOW'. To see it in action you must disable the `enable_rails_exceptions` configuration setting.")
create event ActiveFenceModerationRailException(message="ActiveFence moderation triggered. The maximum risk score exceeded the threshold.")
23
+
else
24
+
bot refuse to respond
22
25
stop
23
26
24
27
define subflow activefence moderation detailed
25
28
"""Guardrail based on individual risk scores."""
26
29
$result = execute call activefence api
27
30
28
31
if $result.violations.get("abusive_or_harmful.harassment_or_bullying", 0) > 0.8
29
-
bot inform cannot engage in abusive or harmful behavior
32
+
if $config.enable_rails_exceptions
33
+
create event ActiveFenceHarassmentRailException(message="ActiveFence moderation triggered. The harassment or bullying risk score exceeded the threshold.")
34
+
else
35
+
bot inform cannot engage in abusive or harmful behavior
30
36
stop
31
37
32
38
if $result.violations.get("abusive_or_harmful.profanity", 0) > 0.7
33
-
bot inform cannot engage in abusive or harmful behavior
39
+
if $config.enable_rails_exceptions
40
+
create event ActiveFenceProfanityRailException(message="ActiveFence moderation triggered. The profanity risk score exceeded the threshold.")
41
+
else
42
+
bot inform cannot engage in abusive or harmful behavior
34
43
stop
35
44
36
45
if $result.violations.get("abusive_or_harmful.hate_speech", 0) > 0.6
37
-
bot inform cannot engage in abusive or harmful behavior
46
+
if $config.enable_rails_exceptions
47
+
create event ActiveFenceHateSpeechRailException(message="ActiveFence moderation triggered. The hate speech risk score exceeded the threshold.")
48
+
else
49
+
bot inform cannot engage in abusive or harmful behavior
38
50
stop
39
51
40
52
if $result.violations.get("abusive_or_harmful.child_grooming", 0) > 0.4
41
-
bot inform cannot engage in abusive or harmful behavior
53
+
if $config.enable_rails_exceptions
54
+
create event ActiveFenceChildGroomingRailException(message="ActiveFence moderation triggered. The child grooming risk score exceeded the threshold.")
55
+
else
56
+
bot inform cannot engage in abusive or harmful behavior
42
57
stop
43
58
44
59
if $result.violations.get("abusive_or_harmful.general_violence", 0) > 0.7
45
-
bot inform cannot engage in abusive or harmful behavior
60
+
if $config.enable_rails_exceptions
61
+
create event ActiveFenceGeneralViolenceRailException(message="ActiveFence moderation triggered. The general violence risk score exceeded the threshold.")
62
+
else
63
+
bot inform cannot engage in abusive or harmful behavior
46
64
stop
47
65
48
66
if $result.violations.get("self_harm.general", 0) > 0.8
49
-
bot inform cannot engage in self harm behavior
67
+
if $config.enable_rails_exceptions
68
+
create event ActiveFenceSelfHarmRailException(message="ActiveFence moderation triggered. The self harm risk score exceeded the threshold.")
69
+
else
70
+
bot inform cannot engage in self harm behavior
50
71
stop
51
72
52
73
if $result.violations.get("adult_content.general", 0) > 0.3
53
-
bot inform cannot engage with inappropriate content
74
+
if $config.enable_rails_exceptions
75
+
create event ActiveFenceAdultContentRailException(message="ActiveFence moderation triggered. The adult content risk score exceeded the threshold.")
76
+
else
77
+
bot inform cannot engage with inappropriate content
54
78
stop
55
79
56
80
if $result.violations.get("privacy_violation.pii", 0) > 0.8
57
-
bot inform cannot engage with sensitive content
81
+
if $config.enable_rails_exceptions
82
+
create event ActiveFencePrivacyViolationRailException(message="ActiveFence moderation triggered. The privacy violation risk score exceeded the threshold.")
83
+
else
84
+
bot inform cannot engage with sensitive content
58
85
stop
59
86
60
-
61
87
define bot inform cannot engage in abusive or harmful behavior
62
88
"I will not engage in any abusive or harmful behavior."
create event JailbreakDetectionRailException(message="Jailbreak attempt detected. The user's prompt was identified as an attempted jailbreak. Please ensure your prompt adheres to the guidelines.")
create event LlamaGuardInputRailException(message="Input not allowed. The input was blocked by the 'llama guard check input' flow. Please ensure your input meets the required criteria.")
create event LlamaGuardOutputRailException(message="Output not allowed. The output was blocked by the 'llama guard check output' flow. Please ensure your output meets the required criteria.")
0 commit comments