From 328aa21a1f1320b407d36e29c9de7434f580b7e8 Mon Sep 17 00:00:00 2001 From: michaelsouliman Date: Mon, 5 May 2025 21:26:38 -0700 Subject: [PATCH 01/17] add basic reporting flow, sends to mod channel with color coded priority --- DiscordBot/bot.py | 25 +------ DiscordBot/report.py | 173 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 149 insertions(+), 49 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index ec5dddb6..f7370511 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -6,7 +6,7 @@ import logging import re import requests -from report import Report +from report import Report, AbuseType, MisinfoCategory, HealthCategory, NewsCategory import pdb # Set up logging to the console @@ -103,29 +103,6 @@ async def handle_channel_message(self, message): if not message.channel.name == f'group-{self.group_num}': return - # Forward the message to the mod channel - mod_channel = self.mod_channels[message.guild.id] - await mod_channel.send(f'Forwarded message:\n{message.author.name}: "{message.content}"') - scores = self.eval_text(message.content) - await mod_channel.send(self.code_format(scores)) - - - def eval_text(self, message): - '''' - TODO: Once you know how you want to evaluate messages in your channel, - insert your code here! This will primarily be used in Milestone 3. - ''' - return message - - - def code_format(self, text): - '''' - TODO: Once you know how you want to show that a message has been - evaluated, insert your code here for formatting the string to be - shown in the mod channel. - ''' - return "Evaluated: '" + text+ "'" - client = ModBot() client.run(discord_token) \ No newline at end of file diff --git a/DiscordBot/report.py b/DiscordBot/report.py index d2bba994..155eecc9 100644 --- a/DiscordBot/report.py +++ b/DiscordBot/report.py @@ -6,8 +6,56 @@ class State(Enum): REPORT_START = auto() AWAITING_MESSAGE = auto() MESSAGE_IDENTIFIED = auto() + AWAITING_ABUSE_TYPE = auto() + AWAITING_MISINFO_CATEGORY = auto() + AWAITING_HEALTH_CATEGORY = auto() + AWAITING_NEWS_CATEGORY = auto() REPORT_COMPLETE = auto() +class AbuseType(Enum): + BULLYING = "bullying" + SUICIDE = "suicide/self-harm" + EXPLICIT = "sexually explicit/nudity" + MISINFORMATION = "misinformation" + HATE = "hate speech" + DANGER = "danger" + +SUICIDE_VARIANTS = { + "suicide", + "self harm", + "self-harm", + "selfharm", + "suicide/self harm", + "suicide/selfharm", + "suicide/self-harm", +} + +EXPLICIT_VARIANTS = { + "explicit", + "sexually explicit", + "sexual", + "nudity", + "nude", + "sexually explicit/nudity", +} + +class MisinfoCategory(Enum): + HEALTH = "health" + ADVERTISEMENT = "advertisement" + NEWS = "news" + +class HealthCategory(Enum): + EMERGENCY = "emergency" + MEDICAL_RESEARCH = "medical research" + REPRODUCTIVE = "reproductive healthcare" + TREATMENTS = "treatments" + ALTERNATIVE = "alternative medicine" + +class NewsCategory(Enum): + HISTORICAL = "historical" + POLITICAL = "political" + SCIENCE = "science" + class Report: START_KEYWORD = "report" CANCEL_KEYWORD = "cancel" @@ -17,28 +65,24 @@ def __init__(self, client): self.state = State.REPORT_START self.client = client self.message = None - - async def handle_message(self, message): - ''' - This function makes up the meat of the user-side reporting flow. It defines how we transition between states and what - prompts to offer at each of those states. You're welcome to change anything you want; this skeleton is just here to - get you started and give you a model for working with Discord. 
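This first commit turns the skeleton's dead-end MESSAGE_IDENTIFIED state into an Enum-driven state machine: every DM from the reporter advances Report.state, and handle_message dispatches on the current state to decide both the reply and the next transition. A minimal, self-contained sketch of that pattern, with illustrative names rather than the project's actual classes:

    from enum import Enum, auto

    class State(Enum):
        START = auto()
        AWAITING_CHOICE = auto()
        COMPLETE = auto()

    class TinyFlow:
        def __init__(self):
            self.state = State.START

        def handle(self, text: str) -> str:
            # Inspect the current state, reply, and transition.
            if self.state == State.START:
                self.state = State.AWAITING_CHOICE
                return "Pick a category: 1 or 2"
            if self.state == State.AWAITING_CHOICE:
                if text.strip() not in {"1", "2"}:
                    return "Please answer 1 or 2"   # invalid input: stay in this state
                self.state = State.COMPLETE
                return f"Recorded choice {text.strip()}"
            return ""

    flow = TinyFlow()
    print(flow.handle("report"))   # Pick a category: 1 or 2
    print(flow.handle("2"))        # Recorded choice 2

Leaving the state unchanged on invalid input is the same design choice the patch makes with its "Please select a valid ..." replies.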
- ''' + self.abuse_type = None + self.misinfo_category = None + self.specific_category = None - if message.content == self.CANCEL_KEYWORD: + async def handle_message(self, message): + if message.content.lower() == self.CANCEL_KEYWORD: self.state = State.REPORT_COMPLETE return ["Report cancelled."] - + if self.state == State.REPORT_START: - reply = "Thank you for starting the reporting process. " + reply = "Thank you for starting the reporting process. " reply += "Say `help` at any time for more information.\n\n" reply += "Please copy paste the link to the message you want to report.\n" reply += "You can obtain this link by right-clicking the message and clicking `Copy Message Link`." self.state = State.AWAITING_MESSAGE return [reply] - + if self.state == State.AWAITING_MESSAGE: - # Parse out the three ID strings from the message link m = re.search('/(\d+)/(\d+)/(\d+)', message.content) if not m: return ["I'm sorry, I couldn't read that link. Please try again or say `cancel` to cancel."] @@ -49,24 +93,103 @@ async def handle_message(self, message): if not channel: return ["It seems this channel was deleted or never existed. Please try again or say `cancel` to cancel."] try: - message = await channel.fetch_message(int(m.group(3))) + self.message = await channel.fetch_message(int(m.group(3))) except discord.errors.NotFound: return ["It seems this message was deleted or never existed. Please try again or say `cancel` to cancel."] + + self.state = State.AWAITING_ABUSE_TYPE + reply = "What type of abuse would you like to report?\n" + reply += "• BULLYING\n" + reply += "• SUICIDE/SELF-HARM\n" + reply += "• SEXUALLY EXPLICIT/NUDITY\n" + reply += "• MISINFORMATION\n" + reply += "• HATE SPEECH\n" + reply += "• DANGER" + return ["I found this message:", "```" + self.message.author.name + ": " + self.message.content + "```", reply] - # Here we've found the message - it's up to you to decide what to do next! - self.state = State.MESSAGE_IDENTIFIED - return ["I found this message:", "```" + message.author.name + ": " + message.content + "```", \ - "This is all I know how to do right now - it's up to you to build out the rest of my reporting flow!"] - - if self.state == State.MESSAGE_IDENTIFIED: - return [""] + if self.state == State.AWAITING_ABUSE_TYPE: + abuse_type = message.content.lower() + if abuse_type in SUICIDE_VARIANTS: + self.abuse_type = AbuseType.SUICIDE + mod_channel = self.client.mod_channels[self.message.guild.id] + await mod_channel.send(f"🔴 URGENT - SUICIDE/SELF-HARM REPORT:\n{self.message.author.name}: {self.message.content}") + self.state = State.REPORT_COMPLETE + return ["Thank you for reporting. This has been escalated to our moderation team for immediate review."] - return [] + if abuse_type in EXPLICIT_VARIANTS: + self.abuse_type = AbuseType.EXPLICIT + mod_channel = self.client.mod_channels[self.message.guild.id] + await mod_channel.send(f"🔴 URGENT - EXPLICIT CONTENT REPORT:\n{self.message.author.name}: {self.message.content}") + self.state = State.REPORT_COMPLETE + return ["Thank you for reporting. 
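The AWAITING_MESSAGE branch leans on the shape of a Discord message link, which ends in three numeric IDs: /<guild_id>/<channel_id>/<message_id>. A standalone sketch of that parse step; note that a raw string for the pattern avoids the SyntaxWarning modern Python emits for the bare '\d' escapes used above:

    import re

    LINK_RE = re.compile(r"/(\d+)/(\d+)/(\d+)")

    def parse_message_link(link: str):
        # Returns (guild_id, channel_id, message_id), or None when the
        # link does not end in the three numeric IDs.
        m = LINK_RE.search(link)
        if not m:
            return None
        return tuple(int(g) for g in m.groups())

    print(parse_message_link("https://discord.com/channels/123/456/789"))  # (123, 456, 789)
    print(parse_message_link("not a link"))                                # None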
This has been escalated to our moderation team for immediate review."] - def report_complete(self): - return self.state == State.REPORT_COMPLETE - + for type in AbuseType: + if abuse_type == type.value: + self.abuse_type = type + if type == AbuseType.MISINFORMATION: + self.state = State.AWAITING_MISINFO_CATEGORY + return ["Please select the misinformation category:\n• HEALTH\n• ADVERTISEMENT\n• NEWS"] + else: + mod_channel = self.client.mod_channels[self.message.guild.id] + priority = "🔴" if type in [AbuseType.HATE, AbuseType.DANGER] else "🟡" + await mod_channel.send(f"{priority} New report - {type.value.upper()}:\n{self.message.author.name}: {self.message.content}") + self.state = State.REPORT_COMPLETE + return ["Thank you for reporting, it has been sent to our moderation team."] + return ["Please select a valid abuse type from the list above."] + if self.state == State.AWAITING_MISINFO_CATEGORY: + category = message.content.lower() + for cat in MisinfoCategory: + if category == cat.value: + self.misinfo_category = cat + if cat == MisinfoCategory.HEALTH: + self.state = State.AWAITING_HEALTH_CATEGORY + return ["Please specify the health misinformation category:\n• EMERGENCY\n• MEDICAL RESEARCH\n• REPRODUCTIVE HEALTHCARE\n• TREATMENTS\n• ALTERNATIVE MEDICINE"] + elif cat == MisinfoCategory.NEWS: + self.state = State.AWAITING_NEWS_CATEGORY + return ["Please specify the news category:\n• HISTORICAL\n• POLITICAL\n• SCIENCE"] + else: # Advertisement + self.state = State.REPORT_COMPLETE + await self.client.mod_channels[self.message.guild.id].send(f"🟡 ADVERTISING MISINFO:\n{self.message.author.name}: {self.message.content}") + return ["This has been reported to our ad team."] + return ["Please select a valid misinformation category from the list above."] - + if self.state == State.AWAITING_HEALTH_CATEGORY: + health_cat = message.content.lower() + for cat in HealthCategory: + if health_cat == cat.value: + self.specific_category = cat + self.state = State.REPORT_COMPLETE + mod_channel = self.client.mod_channels[self.message.guild.id] + + if cat == HealthCategory.EMERGENCY: + await mod_channel.send(f"🔴 HEALTH MISINFO:\n{self.message.author.name}: {self.message.content}") + return ["We will prioritize this and send it for review."] + elif cat in [HealthCategory.MEDICAL_RESEARCH, HealthCategory.REPRODUCTIVE]: + await mod_channel.send(f"🟡 HEALTH MISINFO:\n{self.message.author.name}: {self.message.content}") + return ["This has been sent to moderators."] + else: + await mod_channel.send(f"🟢 HEALTH MISINFO:\n{self.message.author.name}: {self.message.content}") + return ["This has been sent to our team. 
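The commit's color coding maps severity to a circle emoji in the mod channel: 🔴 for urgent escalations (suicide/self-harm, explicit content, hate speech, danger, emergency health claims), 🟡 for mid-tier reports, 🟢 for the lowest tier. Centralizing that mapping in one helper is one way to avoid scattering conditionals; the helper below is illustrative and only approximates the patch's exact tiers:

    URGENT = {"suicide/self-harm", "sexually explicit/nudity", "hate speech", "danger"}
    LOW = {"historical", "science"}

    def priority_marker(category: str) -> str:
        # Red for urgent escalation, green for low-tier flags,
        # yellow for everything in between.
        if category in URGENT:
            return "🔴"
        if category in LOW:
            return "🟢"
        return "🟡"

    print(priority_marker("danger"), priority_marker("advertisement"))  # 🔴 🟡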
Review if necessary, marked with non-scientific flag."] + return ["Please select a valid health category from the list above."] + if self.state == State.AWAITING_NEWS_CATEGORY: + news_cat = message.content.lower() + for cat in NewsCategory: + if news_cat == cat.value: + self.specific_category = cat + self.state = State.REPORT_COMPLETE + mod_channel = self.client.mod_channels[self.message.guild.id] + + if cat == NewsCategory.POLITICAL: + await mod_channel.send(f"🟡 NEWS MISINFO:\n{self.message.author.name}: {self.message.content}") + else: + await mod_channel.send(f"🟢 NEWS MISINFO:\n{self.message.author.name}: {self.message.content}") + return ["This has been sent to our team."] + return ["Please select a valid news category from the list above."] + + return [] + + def report_complete(self): + """Returns whether the current report is in a completed state""" + return self.state == State.REPORT_COMPLETE \ No newline at end of file From d8000e3e7c9583218e8b92dd9bced6fbfa5ad157 Mon Sep 17 00:00:00 2001 From: michaelsouliman Date: Thu, 8 May 2025 18:57:23 -0700 Subject: [PATCH 02/17] remove level from user reporting, add moderator reporting flow --- DiscordBot/bot.py | 151 ++++++++++++++++++++++++++++++++++++++++++- DiscordBot/report.py | 61 ++++++++++------- 2 files changed, 188 insertions(+), 24 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index f7370511..89b280c5 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -34,6 +34,7 @@ def __init__(self): self.group_num = None self.mod_channels = {} # Map from guild to the mod channel id for that guild self.reports = {} # Map from user IDs to the state of their report + self.active_mod_flow = None # State for the current moderation flow async def on_ready(self): print(f'{self.user.name} has connected to Discord! It is these guilds:') @@ -99,10 +100,156 @@ async def handle_dm(self, message): self.reports.pop(author_id) async def handle_channel_message(self, message): - # Only handle messages sent in the "group-#" channel - if not message.channel.name == f'group-{self.group_num}': + # Only handle messages sent in the "group-#-mod" channel + if message.channel.name == f'group-{self.group_num}-mod': + await self.handle_mod_channel_message(message) + elif message.channel.name == f'group-{self.group_num}': return + async def start_moderation_flow(self, report_type, report_content, message_author, message_link=None): + # Determine the initial step based on report type + if report_type.startswith('ADVERTISING MISINFO'): + initial_step = 'advertising_done' + elif report_type.startswith('MISINFORMATION') or report_type.startswith('HEALTH MISINFO') or report_type.startswith('NEWS MISINFO'): + initial_step = 'danger_level' + else: + initial_step = 'default_done' + self.active_mod_flow = { + 'step': initial_step, + 'report_type': report_type, + 'report_content': report_content, + 'message_author': message_author, + 'message_link': message_link, + 'context': {} + } + mod_channel = None + for channel in self.mod_channels.values(): + mod_channel = channel + break + if mod_channel: + await mod_channel.send(f"A new report has been submitted:\nType: {report_type}\nContent: {report_content}\nReported user: {message_author}") + if initial_step == 'danger_level': + await mod_channel.send("What is the level of danger for this report?\n• LOW\n• MEDIUM\n• HIGH") + elif initial_step == 'advertising_done': + await mod_channel.send("Report sent to advertising team. 
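start_moderation_flow drives the entire moderator conversation from a single dictionary: the current step, the report metadata, and a context scratch area that later prompts fill in. The routing from report type to initial step reduces to a few prefix checks; this sketch mirrors the patch's logic, but the standalone function name is made up:

    def initial_step_for(report_type: str) -> str:
        # Advertising reports close immediately; misinformation reports
        # enter danger-level triage; everything else is just posted.
        if report_type.startswith("ADVERTISING MISINFO"):
            return "advertising_done"
        if report_type.startswith(("MISINFORMATION", "HEALTH MISINFO", "NEWS MISINFO")):
            return "danger_level"
        return "default_done"

    flow = {
        "step": initial_step_for("HEALTH MISINFO - TREATMENTS"),
        "report_type": "HEALTH MISINFO - TREATMENTS",
        "context": {},
    }
    print(flow["step"])  # danger_level

Because the bot keeps a single active_mod_flow slot, a second report arriving mid-review overwrites the first; a queue keyed by report would avoid that, though the patch does not go there.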
No further action required.") + self.active_mod_flow = None + elif initial_step == 'default_done': + # Just show the report, do not prompt for reply + self.active_mod_flow = None + else: + await self.prompt_next_moderation_step(mod_channel) + + async def notify_reported_user(self, user_name, guild, outcome, explanation=None): + # Find the user object by name in the guild + user = discord.utils.get(guild.members, name=user_name) + if user: + try: + msg = f"Your message was reviewed by moderators. Outcome: {outcome}." + if explanation: + msg += f"\nReason: {explanation}" + msg += "\nIf you believe this was a mistake, you may reply to this message to appeal." + await user.send(msg) + except Exception as e: + print(f"Failed to DM user {user_name}: {e}") + + async def handle_mod_channel_message(self, message): + if not self.active_mod_flow: + return + step = self.active_mod_flow['step'] + ctx = self.active_mod_flow['context'] + content = message.content.strip().lower() + mod_channel = message.channel + report_type = self.active_mod_flow['report_type'] + report_content = self.active_mod_flow['report_content'] + reported_user_name = self.active_mod_flow['message_author'] + guild = mod_channel.guild if hasattr(mod_channel, 'guild') else None + + # Misinformation moderation flow + if step == 'advertising_done': + # Already handled + self.active_mod_flow = None + return + if step == 'danger_level': + if content not in ['low', 'medium', 'high']: + await mod_channel.send("Invalid option. Please choose:\n• LOW\n• MEDIUM\n• HIGH") + return + ctx['danger_level'] = content + if content == 'low': + await mod_channel.send("Flag post as low danger. After claim is investigated, what action should be taken on post?\n• DO NOT RECOMMEND\n• FLAG AS UNPROVEN") + self.active_mod_flow['step'] = 'low_action_on_post' + return + elif content == 'medium': + await mod_channel.send("Flag post as medium danger. After claim is investigated, what action should be taken on post?\n• REMOVE\n• RAISE\n• REPORT TO AUTHORITIES") + self.active_mod_flow['step'] = 'medium_action_on_post' + return + elif content == 'high': + await mod_channel.send("Flag post as high danger. What emergency action should be taken based on post?\n• REMOVE\n• RAISE\n• REPORT TO AUTHORITIES") + self.active_mod_flow['step'] = 'high_action_on_post' + return + if step == 'low_action_on_post': + if content == 'do not recommend': + await mod_channel.send("Post will not be recommended. Action recorded. (Update algorithm so post is not recommended.)") + await self.notify_reported_user(reported_user_name, guild, outcome="Post not recommended.") + self.active_mod_flow = None + return + elif content == 'flag as unproven': + await mod_channel.send("Post will be flagged as unproven/non-scientific. Please add explanation for why post is being flagged.") + self.active_mod_flow['step'] = 'flag_explanation' + return + else: + await mod_channel.send("Invalid option. Please choose:\n• DO NOT RECOMMEND\n• FLAG AS UNPROVEN") + return + if step == 'flag_explanation': + await mod_channel.send(f"Explanation recorded: {message.content}\nFlagged post as not proven.") + await self.notify_reported_user(reported_user_name, guild, outcome="Post flagged as unproven/non-scientific.", explanation=message.content) + self.active_mod_flow = None + return + if step == 'medium_action_on_post' or step == 'high_action_on_post': + if content == 'remove': + await mod_channel.send("Post will be removed. 
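The remove path is two-phase: choosing REMOVE only flips the step to remove_explanation, and the moderator's next message is consumed as free text (stored in the flow's context) rather than matched against a menu. Stripped of Discord, the hand-off looks like this illustrative sketch:

    flow = {"step": "medium_action_on_post", "context": {}}

    def on_mod_message(text: str) -> str:
        if flow["step"] == "medium_action_on_post" and text.lower() == "remove":
            flow["step"] = "remove_explanation"
            return "Please add explanation for why post is being removed."
        if flow["step"] == "remove_explanation":
            flow["context"]["remove_explanation"] = text  # free text, not a menu pick
            flow["step"] = "action_on_user"
            return "Explanation recorded."
        return "Invalid option."

    print(on_mod_message("remove"))
    print(on_mod_message("claim cites no sources"))
    print(flow["context"])  # {'remove_explanation': 'claim cites no sources'}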
Please add explanation for why post is being removed.") + self.active_mod_flow['step'] = 'remove_explanation' + return + elif content == 'raise': + await mod_channel.send("Raising to higher level moderator. Report sent to higher level moderators.") + self.active_mod_flow = None + return + elif content == 'report to authorities': + await mod_channel.send("Reporting to authorities. Report sent to authorities.") + self.active_mod_flow = None + return + else: + await mod_channel.send("Invalid option. Please choose:\n• REMOVE\n• RAISE\n• REPORT TO AUTHORITIES") + return + if step == 'remove_explanation': + await mod_channel.send(f"Explanation recorded: {message.content}\nPost removed. What action should be taken on the creator of the post?\n• RECORD INCIDENT\n• TEMPORARILY MUTE\n• REMOVE USER") + ctx['remove_explanation'] = message.content + await self.notify_reported_user( + reported_user_name, + guild, + outcome="Post removed.", + explanation=ctx.get('remove_explanation', '') + ) + self.active_mod_flow['step'] = 'action_on_user' + return + if step == 'action_on_user': + if content == 'record incident': + await mod_channel.send("Incident recorded for internal use. (Add to internal incident count for user.)") + self.active_mod_flow = None + return + elif content == 'temporarily mute': + await mod_channel.send("User will be muted for 24 hours.") + self.active_mod_flow = None + return + elif content == 'remove user': + await mod_channel.send("User will be removed.") + self.active_mod_flow = None + return + else: + await mod_channel.send("Invalid option. Please choose:\n• RECORD INCIDENT\n• TEMPORARILY MUTE\n• REMOVE USER") + return + + async def prompt_next_moderation_step(self, mod_channel): + await mod_channel.send("Moderator, please review the report and respond with your decision.") client = ModBot() client.run(discord_token) \ No newline at end of file diff --git a/DiscordBot/report.py b/DiscordBot/report.py index 155eecc9..fb510ab0 100644 --- a/DiscordBot/report.py +++ b/DiscordBot/report.py @@ -112,16 +112,26 @@ async def handle_message(self, message): if abuse_type in SUICIDE_VARIANTS: self.abuse_type = AbuseType.SUICIDE mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"🔴 URGENT - SUICIDE/SELF-HARM REPORT:\n{self.message.author.name}: {self.message.content}") + await mod_channel.send(f"SUICIDE/SELF-HARM REPORT:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type="SUICIDE/SELF-HARM", + report_content=self.message.content, + message_author=self.message.author.name + ) self.state = State.REPORT_COMPLETE - return ["Thank you for reporting. This has been escalated to our moderation team for immediate review."] + return ["Thank you for reporting. This has been sent to our moderation team for review."] if abuse_type in EXPLICIT_VARIANTS: self.abuse_type = AbuseType.EXPLICIT mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"🔴 URGENT - EXPLICIT CONTENT REPORT:\n{self.message.author.name}: {self.message.content}") + await mod_channel.send(f"EXPLICIT CONTENT REPORT:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type="EXPLICIT CONTENT", + report_content=self.message.content, + message_author=self.message.author.name + ) self.state = State.REPORT_COMPLETE - return ["Thank you for reporting. This has been escalated to our moderation team for immediate review."] + return ["Thank you for reporting. 
This has been sent to our moderation team for review."] for type in AbuseType: if abuse_type == type.value: @@ -131,8 +141,12 @@ async def handle_message(self, message): return ["Please select the misinformation category:\n• HEALTH\n• ADVERTISEMENT\n• NEWS"] else: mod_channel = self.client.mod_channels[self.message.guild.id] - priority = "🔴" if type in [AbuseType.HATE, AbuseType.DANGER] else "🟡" - await mod_channel.send(f"{priority} New report - {type.value.upper()}:\n{self.message.author.name}: {self.message.content}") + await mod_channel.send(f"New report - {type.value.upper()}:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type=type.value.upper(), + report_content=self.message.content, + message_author=self.message.author.name + ) self.state = State.REPORT_COMPLETE return ["Thank you for reporting, it has been sent to our moderation team."] return ["Please select a valid abuse type from the list above."] @@ -150,7 +164,12 @@ async def handle_message(self, message): return ["Please specify the news category:\n• HISTORICAL\n• POLITICAL\n• SCIENCE"] else: # Advertisement self.state = State.REPORT_COMPLETE - await self.client.mod_channels[self.message.guild.id].send(f"🟡 ADVERTISING MISINFO:\n{self.message.author.name}: {self.message.content}") + await self.client.mod_channels[self.message.guild.id].send(f"ADVERTISING MISINFO:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type="ADVERTISING MISINFO", + report_content=self.message.content, + message_author=self.message.author.name + ) return ["This has been reported to our ad team."] return ["Please select a valid misinformation category from the list above."] @@ -161,16 +180,13 @@ async def handle_message(self, message): self.specific_category = cat self.state = State.REPORT_COMPLETE mod_channel = self.client.mod_channels[self.message.guild.id] - - if cat == HealthCategory.EMERGENCY: - await mod_channel.send(f"🔴 HEALTH MISINFO:\n{self.message.author.name}: {self.message.content}") - return ["We will prioritize this and send it for review."] - elif cat in [HealthCategory.MEDICAL_RESEARCH, HealthCategory.REPRODUCTIVE]: - await mod_channel.send(f"🟡 HEALTH MISINFO:\n{self.message.author.name}: {self.message.content}") - return ["This has been sent to moderators."] - else: - await mod_channel.send(f"🟢 HEALTH MISINFO:\n{self.message.author.name}: {self.message.content}") - return ["This has been sent to our team. 
Review if necessary, marked with non-scientific flag."] + await mod_channel.send(f"HEALTH MISINFO - {cat.value.upper()}:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type=f"HEALTH MISINFO - {cat.value.upper()}", + report_content=self.message.content, + message_author=self.message.author.name + ) + return ["This has been sent to our moderation team."] return ["Please select a valid health category from the list above."] if self.state == State.AWAITING_NEWS_CATEGORY: @@ -180,11 +196,12 @@ async def handle_message(self, message): self.specific_category = cat self.state = State.REPORT_COMPLETE mod_channel = self.client.mod_channels[self.message.guild.id] - - if cat == NewsCategory.POLITICAL: - await mod_channel.send(f"🟡 NEWS MISINFO:\n{self.message.author.name}: {self.message.content}") - else: - await mod_channel.send(f"🟢 NEWS MISINFO:\n{self.message.author.name}: {self.message.content}") + await mod_channel.send(f"NEWS MISINFO - {cat.value.upper()}:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type=f"NEWS MISINFO - {cat.value.upper()}", + report_content=self.message.content, + message_author=self.message.author.name + ) return ["This has been sent to our team."] return ["Please select a valid news category from the list above."] From 45dca1556d153a67c8bc95f78c67e4cf1eb762c7 Mon Sep 17 00:00:00 2001 From: anushehchaudry Date: Fri, 9 May 2025 01:21:24 -0700 Subject: [PATCH 03/17] Add appeals process --- DiscordBot/bot.py | 145 +++++++++++++++++++++++++++++++++++++++---- DiscordBot/report.py | 17 +++++ 2 files changed, 151 insertions(+), 11 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index 89b280c5..6da724cf 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -6,7 +6,7 @@ import logging import re import requests -from report import Report, AbuseType, MisinfoCategory, HealthCategory, NewsCategory +from report import Report, AbuseType, MisinfoCategory, HealthCategory, NewsCategory, State import pdb # Set up logging to the console @@ -30,10 +30,12 @@ class ModBot(discord.Client): def __init__(self): intents = discord.Intents.default() intents.message_content = True + intents.members = True super().__init__(command_prefix='.', intents=intents) self.group_num = None self.mod_channels = {} # Map from guild to the mod channel id for that guild self.reports = {} # Map from user IDs to the state of their report + self.pending_appeals = {} self.active_mod_flow = None # State for the current moderation flow async def on_ready(self): @@ -72,6 +74,47 @@ async def on_message(self, message): await self.handle_dm(message) async def handle_dm(self, message): + if message.author.id in self.pending_appeals: + # Retrieve all pending appeals for the user + user_appeals = self.pending_appeals[message.author.id] + if not user_appeals: + return + + # Process the first pending appeal + info = user_appeals.pop(0) + if not user_appeals: + # Remove the user from pending_appeals if no appeals remain + del self.pending_appeals[message.author.id] + + mod_chan = self.mod_channels[info['guild_id']] + + # Build the appeal notice + text = ( + f"APPEAL RECEIVED:\n" + f"User: {info['reported_name']}\n" + f"Outcome: {info['outcome']}\n\n" + f"Original Message:\n{info['original_message']}" + ) + if info.get('explanation'): + text += f"\n\nReason: {info['explanation']}" + text += f"\n\nAppeal Reason:\n{message.content}" + + # Send to mod channel + await mod_chan.send(text) + + # Prompt 
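pending_appeals acts as a per-user FIFO queue: each entry carries enough context (guild, outcome, original message, explanation) to rebuild the appeal notice whenever the DM arrives, and pop(0) consumes entries oldest-first. The bookkeeping reduces to the sketch below, where defaultdict stands in for the manual membership checks:

    from collections import defaultdict

    pending_appeals = defaultdict(list)  # user_id -> queued appeal contexts

    def queue_appeal(user_id: int, guild_id: int, outcome: str, original: str):
        pending_appeals[user_id].append({
            "guild_id": guild_id,
            "outcome": outcome,
            "original_message": original,
        })

    def next_appeal(user_id: int):
        # Consume the oldest pending appeal; drop the key once empty.
        queue = pending_appeals.get(user_id)
        if not queue:
            return None
        info = queue.pop(0)
        if not queue:
            del pending_appeals[user_id]
        return info

    queue_appeal(42, 1, "Post removed.", "original text")
    print(next_appeal(42))  # oldest queued context for user 42
    print(next_appeal(42))  # None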
mods for ACCEPT/UPHOLD + self.active_mod_flow = { + 'step': 'appeal_review', + 'message_author': info['reported_name'], + 'context': {}, + 'guild_id': info['guild_id'] + } + await mod_chan.send("Moderators, please respond with:\n• ACCEPT\n• UPHOLD") + + # Acknowledge to user + await message.channel.send("Your appeal has been submitted and is under review.") + return + # Handle a help message if message.content == Report.HELP_KEYWORD: reply = "Use the `report` command to begin the reporting process.\n" @@ -90,7 +133,7 @@ async def handle_dm(self, message): if author_id not in self.reports: self.reports[author_id] = Report(self) - # Let the report class handle this message; forward all the messages it returns to uss + # Let the report class handle this message; forward all the messages it returns to us responses = await self.reports[author_id].handle_message(message) for r in responses: await message.channel.send(r) @@ -139,30 +182,65 @@ async def start_moderation_flow(self, report_type, report_content, message_autho else: await self.prompt_next_moderation_step(mod_channel) - async def notify_reported_user(self, user_name, guild, outcome, explanation=None): - # Find the user object by name in the guild + async def notify_reported_user(self, user_name, guild, outcome, explanation=None, original_message=None): + """Notify the user about the outcome and provide an appeal option.""" user = discord.utils.get(guild.members, name=user_name) if user: try: msg = f"Your message was reviewed by moderators. Outcome: {outcome}." + if original_message: + msg += f"\n\n**Original Message:**\n{original_message}" if explanation: - msg += f"\nReason: {explanation}" - msg += "\nIf you believe this was a mistake, you may reply to this message to appeal." + msg += f"\n\n**Reason:** {explanation}" + msg += "\n\nIf you believe this was a mistake, you may reply to this message to appeal." await user.send(msg) except Exception as e: print(f"Failed to DM user {user_name}: {e}") + async def notify_user_of_appeal_option(self, user_name, guild, explanation): + """Notify the user about the appeal process after their post is removed.""" + user = discord.utils.get(guild.members, name=user_name) + if user: + try: + msg = f"Your post was removed for the following reason: {explanation}.\n" + msg += "If you believe this was a mistake, you can appeal by replying with your reason." + await user.send(msg) + except Exception as e: + print(f"Failed to notify user {user_name}: {e}") + async def handle_mod_channel_message(self, message): if not self.active_mod_flow: return step = self.active_mod_flow['step'] - ctx = self.active_mod_flow['context'] content = message.content.strip().lower() mod_channel = message.channel + guild = mod_channel.guild if hasattr(mod_channel, 'guild') else None + + if step == 'appeal_review': + if content == 'accept': + await mod_channel.send("The appeal has been accepted. The original decision has been overturned.") + user = discord.utils.get(guild.members, name=self.active_mod_flow['message_author']) + if user: + await user.send("Your appeal has been accepted. 
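Every notification here locates the member with discord.utils.get(guild.members, name=...), which works only because intents.members is enabled in __init__, and which quietly returns None if the user renames themselves between report and resolution. The immutable numeric ID is the sturdier handle; a hedged sketch of that lookup order (the helper name is illustrative, not part of the patch):

    from typing import Optional
    import discord

    def find_member(guild: discord.Guild, user_id: Optional[int] = None,
                    name: Optional[str] = None) -> Optional[discord.Member]:
        # Prefer the immutable snowflake ID; fall back to the mutable username.
        if user_id is not None:
            return guild.get_member(user_id)
        return discord.utils.get(guild.members, name=name)

Storing message.author.id alongside message.author.name at report time would let the moderation flow use the ID path throughout.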
The original decision has been overturned.") + self.active_mod_flow = None + return + + elif content == 'uphold': + await mod_channel.send("The appeal has been reviewed and the original decision is upheld.") + user = discord.utils.get(guild.members, name=self.active_mod_flow['message_author']) + if user: + await user.send("Your appeal has been reviewed, and the original decision is upheld.") + self.active_mod_flow = None + return + + else: + await mod_channel.send("Invalid response. Please respond with:\n• ACCEPT\n• UPHOLD") + return + + ctx = self.active_mod_flow['context'] report_type = self.active_mod_flow['report_type'] report_content = self.active_mod_flow['report_content'] reported_user_name = self.active_mod_flow['message_author'] - guild = mod_channel.guild if hasattr(mod_channel, 'guild') else None # Misinformation moderation flow if step == 'advertising_done': @@ -221,13 +299,32 @@ async def handle_mod_channel_message(self, message): await mod_channel.send("Invalid option. Please choose:\n• REMOVE\n• RAISE\n• REPORT TO AUTHORITIES") return if step == 'remove_explanation': - await mod_channel.send(f"Explanation recorded: {message.content}\nPost removed. What action should be taken on the creator of the post?\n• RECORD INCIDENT\n• TEMPORARILY MUTE\n• REMOVE USER") - ctx['remove_explanation'] = message.content + explanation = message.content + ctx['remove_explanation'] = explanation await self.notify_reported_user( reported_user_name, guild, outcome="Post removed.", - explanation=ctx.get('remove_explanation', '') + explanation=ctx.get('remove_explanation', ''), + original_message=report_content + ) + # Send only the appeal prompt + user = discord.utils.get(guild.members, name=reported_user_name) + if user: + # Track for incoming DM in pending_appeals + if user.id not in self.pending_appeals: + self.pending_appeals[user.id] = [] + self.pending_appeals[user.id].append({ + 'guild_id': guild.id, + 'reported_name': reported_user_name, + 'outcome': "Post removed.", + 'original_message': report_content, + 'explanation': explanation + }) + await mod_channel.send( + f"Explanation recorded: {explanation}\n" + "What action should be taken on the creator of the post?\n" + "• RECORD INCIDENT\n• TEMPORARILY MUTE\n• REMOVE USER" ) self.active_mod_flow['step'] = 'action_on_user' return @@ -238,10 +335,36 @@ async def handle_mod_channel_message(self, message): return elif content == 'temporarily mute': await mod_channel.send("User will be muted for 24 hours.") + await self.notify_reported_user( + reported_user_name, + guild, + outcome="You have been temporarily muted.", + explanation="You violated the community guidelines.", + original_message=report_content + ) self.active_mod_flow = None return elif content == 'remove user': await mod_channel.send("User will be removed.") + await self.notify_reported_user( + reported_user_name, + guild, + outcome="You have been removed from the server.", + explanation="You violated the community guidelines.", + original_message=report_content + ) + user = discord.utils.get(guild.members, name=reported_user_name) + if user: + # Track for incoming DM in pending_appeals + if user.id not in self.pending_appeals: + self.pending_appeals[user.id] = [] + self.pending_appeals[user.id].append({ + 'guild_id': guild.id, + 'reported_name': reported_user_name, + 'outcome': "You have been removed from the server.", + 'original_message': report_content, + 'explanation': "You violated the community guidelines." 
+ }) self.active_mod_flow = None return else: diff --git a/DiscordBot/report.py b/DiscordBot/report.py index fb510ab0..efb16d72 100644 --- a/DiscordBot/report.py +++ b/DiscordBot/report.py @@ -11,6 +11,8 @@ class State(Enum): AWAITING_HEALTH_CATEGORY = auto() AWAITING_NEWS_CATEGORY = auto() REPORT_COMPLETE = auto() + AWAITING_APPEAL = auto() + APPEAL_REVIEW = auto() class AbuseType(Enum): BULLYING = "bullying" @@ -207,6 +209,21 @@ async def handle_message(self, message): return [] + async def notify_reported_user(self, user_name, guild, outcome, explanation=None): + # Find the user object by name in the guild + user = discord.utils.get(guild.members, name=user_name) + if user: + try: + msg = f"Your message was reviewed by moderators. Outcome: {outcome}." + if explanation: + msg += f"\nReason: {explanation}" + msg += "\nIf you believe this was a mistake, you may reply to this message to appeal." + await user.send(msg) + if outcome == "Post removed.": + await self.notify_user_of_appeal_option(user_name, guild, explanation) + except Exception as e: + print(f"Failed to DM user {user_name}: {e}") + def report_complete(self): """Returns whether the current report is in a completed state""" return self.state == State.REPORT_COMPLETE \ No newline at end of file From 9020a0d09c5ca98d2ce5f569353999f7ec9ab6d5 Mon Sep 17 00:00:00 2001 From: michaelsouliman Date: Tue, 27 May 2025 19:20:41 -0700 Subject: [PATCH 04/17] added per-user statistics tracking in user_stats.json, changed to numerical inputs --- DiscordBot/bot.py | 112 +++++++++++++++++------ DiscordBot/report.py | 181 +++++++++++++++++++++---------------- DiscordBot/user_stats.json | 21 +++++ DiscordBot/user_stats.py | 53 +++++++++++ 4 files changed, 263 insertions(+), 104 deletions(-) create mode 100644 DiscordBot/user_stats.json create mode 100644 DiscordBot/user_stats.py diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index 89b280c5..afaa8656 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -7,6 +7,7 @@ import re import requests from report import Report, AbuseType, MisinfoCategory, HealthCategory, NewsCategory +from user_stats import UserStats import pdb # Set up logging to the console @@ -30,11 +31,13 @@ class ModBot(discord.Client): def __init__(self): intents = discord.Intents.default() intents.message_content = True + intents.members = True # Add members intent super().__init__(command_prefix='.', intents=intents) self.group_num = None self.mod_channels = {} # Map from guild to the mod channel id for that guild self.reports = {} # Map from user IDs to the state of their report self.active_mod_flow = None # State for the current moderation flow + self.user_stats = UserStats() # Initialize user statistics tracking async def on_ready(self): print(f'{self.user.name} has connected to Discord! It is these guilds:') @@ -129,7 +132,7 @@ async def start_moderation_flow(self, report_type, report_content, message_autho if mod_channel: await mod_channel.send(f"A new report has been submitted:\nType: {report_type}\nContent: {report_content}\nReported user: {message_author}") if initial_step == 'danger_level': - await mod_channel.send("What is the level of danger for this report?\n• LOW\n• MEDIUM\n• HIGH") + await mod_channel.send("What is the level of danger for this report?\n1. LOW\n2. MEDIUM\n3. HIGH") elif initial_step == 'advertising_done': await mod_channel.send("Report sent to advertising team. 
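This commit converts every prompt to a numbered menu, so each handler repeats one idiom: validate the reply against a small digit-keyed dict, then map it to a canonical value. A generic version of that idiom (the helper is a suggestion, not code from the patch):

    def pick(reply: str, options: dict):
        # Returns the mapped option, or None for an invalid reply.
        return options.get(reply.strip())

    DANGER_LEVELS = {"1": "low", "2": "medium", "3": "high"}

    choice = pick(" 2 ", DANGER_LEVELS)
    if choice is None:
        print("Invalid option. Please choose:\n1. LOW\n2. MEDIUM\n3. HIGH")
    else:
        print(f"danger level recorded: {choice}")  # danger level recorded: medium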
No further action required.") self.active_mod_flow = None @@ -164,64 +167,96 @@ async def handle_mod_channel_message(self, message): reported_user_name = self.active_mod_flow['message_author'] guild = mod_channel.guild if hasattr(mod_channel, 'guild') else None + # Get the user ID from the reported user's name + reported_user = discord.utils.get(guild.members, name=reported_user_name) + if not reported_user: + await mod_channel.send(f"Could not find user {reported_user_name}. Please verify the username is correct.") + return + # Misinformation moderation flow if step == 'advertising_done': # Already handled self.active_mod_flow = None return if step == 'danger_level': - if content not in ['low', 'medium', 'high']: - await mod_channel.send("Invalid option. Please choose:\n• LOW\n• MEDIUM\n• HIGH") + if content not in ['1', '2', '3']: + await mod_channel.send("Invalid option. Please choose:\n1. LOW\n2. MEDIUM\n3. HIGH") return - ctx['danger_level'] = content - if content == 'low': - await mod_channel.send("Flag post as low danger. After claim is investigated, what action should be taken on post?\n• DO NOT RECOMMEND\n• FLAG AS UNPROVEN") + danger_levels = {'1': 'low', '2': 'medium', '3': 'high'} + ctx['danger_level'] = danger_levels[content] + if content == '1': # LOW + await mod_channel.send("Flag post as low danger. After claim is investigated, what action should be taken on post?\n1. DO NOT RECOMMEND\n2. FLAG AS UNPROVEN") self.active_mod_flow['step'] = 'low_action_on_post' return - elif content == 'medium': - await mod_channel.send("Flag post as medium danger. After claim is investigated, what action should be taken on post?\n• REMOVE\n• RAISE\n• REPORT TO AUTHORITIES") + elif content == '2': # MEDIUM + await mod_channel.send("Flag post as medium danger. After claim is investigated, what action should be taken on post?\n1. REMOVE\n2. RAISE\n3. REPORT TO AUTHORITIES") self.active_mod_flow['step'] = 'medium_action_on_post' return - elif content == 'high': - await mod_channel.send("Flag post as high danger. What emergency action should be taken based on post?\n• REMOVE\n• RAISE\n• REPORT TO AUTHORITIES") + elif content == '3': # HIGH + await mod_channel.send("Flag post as high danger. What emergency action should be taken based on post?\n1. REMOVE\n2. RAISE\n3. REPORT TO AUTHORITIES") self.active_mod_flow['step'] = 'high_action_on_post' return if step == 'low_action_on_post': - if content == 'do not recommend': + if content not in ['1', '2']: + await mod_channel.send("Invalid option. Please choose:\n1. DO NOT RECOMMEND\n2. FLAG AS UNPROVEN") + return + if content == '1': # DO NOT RECOMMEND await mod_channel.send("Post will not be recommended. Action recorded. (Update algorithm so post is not recommended.)") await self.notify_reported_user(reported_user_name, guild, outcome="Post not recommended.") + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Post not recommended" + ) self.active_mod_flow = None return - elif content == 'flag as unproven': + elif content == '2': # FLAG AS UNPROVEN await mod_channel.send("Post will be flagged as unproven/non-scientific. Please add explanation for why post is being flagged.") self.active_mod_flow['step'] = 'flag_explanation' return - else: - await mod_channel.send("Invalid option. 
Please choose:\n• DO NOT RECOMMEND\n• FLAG AS UNPROVEN") - return if step == 'flag_explanation': await mod_channel.send(f"Explanation recorded: {message.content}\nFlagged post as not proven.") await self.notify_reported_user(reported_user_name, guild, outcome="Post flagged as unproven/non-scientific.", explanation=message.content) + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Post flagged as unproven/non-scientific", + message.content + ) self.active_mod_flow = None return if step == 'medium_action_on_post' or step == 'high_action_on_post': - if content == 'remove': + if content not in ['1', '2', '3']: + await mod_channel.send("Invalid option. Please choose:\n1. REMOVE\n2. RAISE\n3. REPORT TO AUTHORITIES") + return + if content == '1': # REMOVE await mod_channel.send("Post will be removed. Please add explanation for why post is being removed.") self.active_mod_flow['step'] = 'remove_explanation' return - elif content == 'raise': + elif content == '2': # RAISE await mod_channel.send("Raising to higher level moderator. Report sent to higher level moderators.") + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Report raised to higher level moderator" + ) self.active_mod_flow = None return - elif content == 'report to authorities': + elif content == '3': # REPORT TO AUTHORITIES await mod_channel.send("Reporting to authorities. Report sent to authorities.") + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Reported to authorities" + ) self.active_mod_flow = None return - else: - await mod_channel.send("Invalid option. Please choose:\n• REMOVE\n• RAISE\n• REPORT TO AUTHORITIES") - return if step == 'remove_explanation': - await mod_channel.send(f"Explanation recorded: {message.content}\nPost removed. What action should be taken on the creator of the post?\n• RECORD INCIDENT\n• TEMPORARILY MUTE\n• REMOVE USER") + await mod_channel.send(f"Explanation recorded: {message.content}\nPost removed. What action should be taken on the creator of the post?\n1. RECORD INCIDENT\n2. TEMPORARILY MUTE\n3. REMOVE USER") ctx['remove_explanation'] = message.content await self.notify_reported_user( reported_user_name, @@ -232,21 +267,42 @@ async def handle_mod_channel_message(self, message): self.active_mod_flow['step'] = 'action_on_user' return if step == 'action_on_user': - if content == 'record incident': + if content not in ['1', '2', '3']: + await mod_channel.send("Invalid option. Please choose:\n1. RECORD INCIDENT\n2. TEMPORARILY MUTE\n3. REMOVE USER") + return + if content == '1': # RECORD INCIDENT await mod_channel.send("Incident recorded for internal use. 
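Each terminal branch now also writes a structured record through UserStats.add_report, so user_stats.json accumulates one entry per resolved report. Per the user_stats.py introduced later in this series, a stored record has this shape:

    from datetime import datetime

    record = {
        "timestamp": datetime.now().isoformat(),
        "report_type": "HEALTH MISINFO - TREATMENTS",
        "report_content": "this is health misinformation",
        "outcome": "Post removed and incident recorded",
        "explanation": "not true",  # None when no explanation was collected
    }
    # stats[user_id]["reports"].append(record); total_reports is kept as len(reports).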
(Add to internal incident count for user.)") + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Post removed and incident recorded", + ctx.get('remove_explanation', '') + ) self.active_mod_flow = None return - elif content == 'temporarily mute': + elif content == '2': # TEMPORARILY MUTE await mod_channel.send("User will be muted for 24 hours.") + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Post removed and user temporarily muted", + ctx.get('remove_explanation', '') + ) self.active_mod_flow = None return - elif content == 'remove user': + elif content == '3': # REMOVE USER await mod_channel.send("User will be removed.") + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Post removed and user removed", + ctx.get('remove_explanation', '') + ) self.active_mod_flow = None return - else: - await mod_channel.send("Invalid option. Please choose:\n• RECORD INCIDENT\n• TEMPORARILY MUTE\n• REMOVE USER") - return async def prompt_next_moderation_step(self, mod_channel): await mod_channel.send("Moderator, please review the report and respond with your decision.") diff --git a/DiscordBot/report.py b/DiscordBot/report.py index fb510ab0..a67bf932 100644 --- a/DiscordBot/report.py +++ b/DiscordBot/report.py @@ -99,18 +99,31 @@ async def handle_message(self, message): self.state = State.AWAITING_ABUSE_TYPE reply = "What type of abuse would you like to report?\n" - reply += "• BULLYING\n" - reply += "• SUICIDE/SELF-HARM\n" - reply += "• SEXUALLY EXPLICIT/NUDITY\n" - reply += "• MISINFORMATION\n" - reply += "• HATE SPEECH\n" - reply += "• DANGER" + reply += "1. BULLYING\n" + reply += "2. SUICIDE/SELF-HARM\n" + reply += "3. SEXUALLY EXPLICIT/NUDITY\n" + reply += "4. MISINFORMATION\n" + reply += "5. HATE SPEECH\n" + reply += "6. DANGER" return ["I found this message:", "```" + self.message.author.name + ": " + self.message.content + "```", reply] if self.state == State.AWAITING_ABUSE_TYPE: - abuse_type = message.content.lower() - if abuse_type in SUICIDE_VARIANTS: - self.abuse_type = AbuseType.SUICIDE + abuse_type = message.content.strip() + abuse_types = { + '1': AbuseType.BULLYING, + '2': AbuseType.SUICIDE, + '3': AbuseType.EXPLICIT, + '4': AbuseType.MISINFORMATION, + '5': AbuseType.HATE, + '6': AbuseType.DANGER + } + + if abuse_type not in abuse_types: + return ["Please select a valid option (1-6) from the list above."] + + self.abuse_type = abuse_types[abuse_type] + + if self.abuse_type == AbuseType.SUICIDE: mod_channel = self.client.mod_channels[self.message.guild.id] await mod_channel.send(f"SUICIDE/SELF-HARM REPORT:\n{self.message.author.name}: {self.message.content}") await self.client.start_moderation_flow( @@ -121,8 +134,7 @@ async def handle_message(self, message): self.state = State.REPORT_COMPLETE return ["Thank you for reporting. This has been sent to our moderation team for review."] - if abuse_type in EXPLICIT_VARIANTS: - self.abuse_type = AbuseType.EXPLICIT + if self.abuse_type == AbuseType.EXPLICIT: mod_channel = self.client.mod_channels[self.message.guild.id] await mod_channel.send(f"EXPLICIT CONTENT REPORT:\n{self.message.author.name}: {self.message.content}") await self.client.start_moderation_flow( @@ -133,77 +145,94 @@ async def handle_message(self, message): self.state = State.REPORT_COMPLETE return ["Thank you for reporting. 
This has been sent to our moderation team for review."] - for type in AbuseType: - if abuse_type == type.value: - self.abuse_type = type - if type == AbuseType.MISINFORMATION: - self.state = State.AWAITING_MISINFO_CATEGORY - return ["Please select the misinformation category:\n• HEALTH\n• ADVERTISEMENT\n• NEWS"] - else: - mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"New report - {type.value.upper()}:\n{self.message.author.name}: {self.message.content}") - await self.client.start_moderation_flow( - report_type=type.value.upper(), - report_content=self.message.content, - message_author=self.message.author.name - ) - self.state = State.REPORT_COMPLETE - return ["Thank you for reporting, it has been sent to our moderation team."] - return ["Please select a valid abuse type from the list above."] + if self.abuse_type == AbuseType.MISINFORMATION: + self.state = State.AWAITING_MISINFO_CATEGORY + return ["Please select the misinformation category:\n1. HEALTH\n2. ADVERTISEMENT\n3. NEWS"] + else: + mod_channel = self.client.mod_channels[self.message.guild.id] + await mod_channel.send(f"New report - {self.abuse_type.value.upper()}:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type=self.abuse_type.value.upper(), + report_content=self.message.content, + message_author=self.message.author.name + ) + self.state = State.REPORT_COMPLETE + return ["Thank you for reporting, it has been sent to our moderation team."] if self.state == State.AWAITING_MISINFO_CATEGORY: - category = message.content.lower() - for cat in MisinfoCategory: - if category == cat.value: - self.misinfo_category = cat - if cat == MisinfoCategory.HEALTH: - self.state = State.AWAITING_HEALTH_CATEGORY - return ["Please specify the health misinformation category:\n• EMERGENCY\n• MEDICAL RESEARCH\n• REPRODUCTIVE HEALTHCARE\n• TREATMENTS\n• ALTERNATIVE MEDICINE"] - elif cat == MisinfoCategory.NEWS: - self.state = State.AWAITING_NEWS_CATEGORY - return ["Please specify the news category:\n• HISTORICAL\n• POLITICAL\n• SCIENCE"] - else: # Advertisement - self.state = State.REPORT_COMPLETE - await self.client.mod_channels[self.message.guild.id].send(f"ADVERTISING MISINFO:\n{self.message.author.name}: {self.message.content}") - await self.client.start_moderation_flow( - report_type="ADVERTISING MISINFO", - report_content=self.message.content, - message_author=self.message.author.name - ) - return ["This has been reported to our ad team."] - return ["Please select a valid misinformation category from the list above."] + category = message.content.strip() + misinfo_categories = { + '1': MisinfoCategory.HEALTH, + '2': MisinfoCategory.ADVERTISEMENT, + '3': MisinfoCategory.NEWS + } + + if category not in misinfo_categories: + return ["Please select a valid option (1-3) from the list above."] + + self.misinfo_category = misinfo_categories[category] + + if self.misinfo_category == MisinfoCategory.HEALTH: + self.state = State.AWAITING_HEALTH_CATEGORY + return ["Please specify the health misinformation category:\n1. EMERGENCY\n2. MEDICAL RESEARCH\n3. REPRODUCTIVE HEALTHCARE\n4. TREATMENTS\n5. ALTERNATIVE MEDICINE"] + elif self.misinfo_category == MisinfoCategory.NEWS: + self.state = State.AWAITING_NEWS_CATEGORY + return ["Please specify the news category:\n1. HISTORICAL\n2. POLITICAL\n3. 
SCIENCE"] + else: # Advertisement + self.state = State.REPORT_COMPLETE + await self.client.mod_channels[self.message.guild.id].send(f"ADVERTISING MISINFO:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type="ADVERTISING MISINFO", + report_content=self.message.content, + message_author=self.message.author.name + ) + return ["This has been reported to our ad team."] if self.state == State.AWAITING_HEALTH_CATEGORY: - health_cat = message.content.lower() - for cat in HealthCategory: - if health_cat == cat.value: - self.specific_category = cat - self.state = State.REPORT_COMPLETE - mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"HEALTH MISINFO - {cat.value.upper()}:\n{self.message.author.name}: {self.message.content}") - await self.client.start_moderation_flow( - report_type=f"HEALTH MISINFO - {cat.value.upper()}", - report_content=self.message.content, - message_author=self.message.author.name - ) - return ["This has been sent to our moderation team."] - return ["Please select a valid health category from the list above."] + health_cat = message.content.strip() + health_categories = { + '1': HealthCategory.EMERGENCY, + '2': HealthCategory.MEDICAL_RESEARCH, + '3': HealthCategory.REPRODUCTIVE, + '4': HealthCategory.TREATMENTS, + '5': HealthCategory.ALTERNATIVE + } + + if health_cat not in health_categories: + return ["Please select a valid option (1-5) from the list above."] + + self.specific_category = health_categories[health_cat] + self.state = State.REPORT_COMPLETE + mod_channel = self.client.mod_channels[self.message.guild.id] + await mod_channel.send(f"HEALTH MISINFO - {self.specific_category.value.upper()}:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type=f"HEALTH MISINFO - {self.specific_category.value.upper()}", + report_content=self.message.content, + message_author=self.message.author.name + ) + return ["This has been sent to our moderation team."] if self.state == State.AWAITING_NEWS_CATEGORY: - news_cat = message.content.lower() - for cat in NewsCategory: - if news_cat == cat.value: - self.specific_category = cat - self.state = State.REPORT_COMPLETE - mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"NEWS MISINFO - {cat.value.upper()}:\n{self.message.author.name}: {self.message.content}") - await self.client.start_moderation_flow( - report_type=f"NEWS MISINFO - {cat.value.upper()}", - report_content=self.message.content, - message_author=self.message.author.name - ) - return ["This has been sent to our team."] - return ["Please select a valid news category from the list above."] + news_cat = message.content.strip() + news_categories = { + '1': NewsCategory.HISTORICAL, + '2': NewsCategory.POLITICAL, + '3': NewsCategory.SCIENCE + } + + if news_cat not in news_categories: + return ["Please select a valid option (1-3) from the list above."] + + self.specific_category = news_categories[news_cat] + self.state = State.REPORT_COMPLETE + mod_channel = self.client.mod_channels[self.message.guild.id] + await mod_channel.send(f"NEWS MISINFO - {self.specific_category.value.upper()}:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type=f"NEWS MISINFO - {self.specific_category.value.upper()}", + report_content=self.message.content, + message_author=self.message.author.name + ) + return ["This has been sent to our team."] return [] diff --git 
a/DiscordBot/user_stats.json b/DiscordBot/user_stats.json new file mode 100644 index 00000000..3b250c4a --- /dev/null +++ b/DiscordBot/user_stats.json @@ -0,0 +1,21 @@ +{ + "1364364429797363722": { + "total_reports": 2, + "reports": [ + { + "timestamp": "2025-05-27T19:16:54.138877", + "report_type": "HEALTH MISINFO - TREATMENTS", + "report_content": "this is health misinformation", + "outcome": "Report raised to higher level moderator", + "explanation": null + }, + { + "timestamp": "2025-05-27T19:17:33.416435", + "report_type": "HEALTH MISINFO - TREATMENTS", + "report_content": "this is health misinformation", + "outcome": "Post removed and user temporarily muted", + "explanation": "not true" + } + ] + } +} \ No newline at end of file diff --git a/DiscordBot/user_stats.py b/DiscordBot/user_stats.py new file mode 100644 index 00000000..cc2165f7 --- /dev/null +++ b/DiscordBot/user_stats.py @@ -0,0 +1,53 @@ +import json +import os +from datetime import datetime + +class UserStats: + def __init__(self): + self.stats_file = 'user_stats.json' + # Clear the stats file on initialization + self._clear_stats() + self.stats = self._load_stats() + + def _clear_stats(self): + # Create an empty stats file + with open(self.stats_file, 'w') as f: + json.dump({}, f) + + def _load_stats(self): + if os.path.exists(self.stats_file): + with open(self.stats_file, 'r') as f: + return json.load(f) + return {} + + def _save_stats(self): + with open(self.stats_file, 'w') as f: + json.dump(self.stats, f, indent=2) + + def add_report(self, user_id, report_type, report_content, outcome, explanation=None): + if user_id not in self.stats: + self.stats[user_id] = { + 'total_reports': 0, + 'reports': [] + } + + report = { + 'timestamp': datetime.now().isoformat(), + 'report_type': report_type, + 'report_content': report_content, + 'outcome': outcome, + 'explanation': explanation + } + + self.stats[user_id]['reports'].append(report) + self.stats[user_id]['total_reports'] = len(self.stats[user_id]['reports']) + self._save_stats() + + def get_user_stats(self, user_id): + return self.stats.get(user_id, { + 'total_reports': 0, + 'reports': [] + }) + + def get_all_stats(self): + return self.stats \ No newline at end of file From fb8f305ec3f58df7dd77d1e52bfda6cc04374d93 Mon Sep 17 00:00:00 2001 From: michaelsouliman Date: Tue, 27 May 2025 19:30:30 -0700 Subject: [PATCH 05/17] stashing changes --- DiscordBot/report.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/DiscordBot/report.py b/DiscordBot/report.py index 45480a1f..72ce422b 100644 --- a/DiscordBot/report.py +++ b/DiscordBot/report.py @@ -22,25 +22,6 @@ class AbuseType(Enum): HATE = "hate speech" DANGER = "danger" -SUICIDE_VARIANTS = { - "suicide", - "self harm", - "self-harm", - "selfharm", - "suicide/self harm", - "suicide/selfharm", - "suicide/self-harm", -} - -EXPLICIT_VARIANTS = { - "explicit", - "sexually explicit", - "sexual", - "nudity", - "nude", - "sexually explicit/nudity", -} - class MisinfoCategory(Enum): HEALTH = "health" ADVERTISEMENT = "advertisement" From 07e4031ffea5c5214bf11b30bd28485dbde25aef Mon Sep 17 00:00:00 2001 From: michaelsouliman Date: Tue, 27 May 2025 19:44:23 -0700 Subject: [PATCH 06/17] fixed merge errors --- DiscordBot/bot.py | 7 ++++--- DiscordBot/user_stats.json | 13 +++---------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index 1eb64035..d0b73086 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -111,7 +111,7 @@ async def handle_dm(self, message): 
'context': {}, 'guild_id': info['guild_id'] } - await mod_chan.send("Moderators, please respond with:\n• ACCEPT\n• UPHOLD") + await mod_chan.send("Moderators, please respond with:\n1. ACCEPT\n2. UPHOLD") # Acknowledge to user await message.channel.send("Your appeal has been submitted and is under review.") @@ -236,7 +236,7 @@ async def handle_mod_channel_message(self, message): return else: - await mod_channel.send("Invalid response. Please respond with:\n• ACCEPT\n• UPHOLD") + await mod_channel.send("Invalid response. Please respond with:\n1. ACCEPT\n2. UPHOLD") return ctx = self.active_mod_flow['context'] @@ -358,7 +358,7 @@ async def handle_mod_channel_message(self, message): await mod_channel.send( f"Explanation recorded: {explanation}\n" "What action should be taken on the creator of the post?\n" - "• RECORD INCIDENT\n• TEMPORARILY MUTE\n• REMOVE USER" + "1. RECORD INCIDENT\n2. TEMPORARILY MUTE\n3. REMOVE USER" ) self.active_mod_flow['step'] = 'action_on_user' return @@ -385,6 +385,7 @@ async def handle_mod_channel_message(self, message): report_content, "Post removed and user temporarily muted", ctx.get('remove_explanation', '') + ) await self.notify_reported_user( reported_user_name, guild, diff --git a/DiscordBot/user_stats.json b/DiscordBot/user_stats.json index 3b250c4a..139db9d9 100644 --- a/DiscordBot/user_stats.json +++ b/DiscordBot/user_stats.json @@ -1,19 +1,12 @@ { "1364364429797363722": { - "total_reports": 2, + "total_reports": 1, "reports": [ { - "timestamp": "2025-05-27T19:16:54.138877", + "timestamp": "2025-05-27T19:39:16.228235", "report_type": "HEALTH MISINFO - TREATMENTS", "report_content": "this is health misinformation", - "outcome": "Report raised to higher level moderator", - "explanation": null - }, - { - "timestamp": "2025-05-27T19:17:33.416435", - "report_type": "HEALTH MISINFO - TREATMENTS", - "report_content": "this is health misinformation", - "outcome": "Post removed and user temporarily muted", + "outcome": "Post removed and incident recorded", "explanation": "not true" } ] From 256866dab69ce928feeff8a3ddab997cb3ae0492 Mon Sep 17 00:00:00 2001 From: michaelsouliman Date: Tue, 27 May 2025 19:48:46 -0700 Subject: [PATCH 07/17] fixed input for uphold/accept --- DiscordBot/bot.py | 4 ++-- DiscordBot/user_stats.json | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index d0b73086..5097a3de 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -219,7 +219,7 @@ async def handle_mod_channel_message(self, message): guild = mod_channel.guild if hasattr(mod_channel, 'guild') else None if step == 'appeal_review': - if content == 'accept': + if content == '1': await mod_channel.send("The appeal has been accepted. 
The original decision has been overturned.") user = discord.utils.get(guild.members, name=self.active_mod_flow['message_author']) if user: @@ -227,7 +227,7 @@ async def handle_mod_channel_message(self, message): self.active_mod_flow = None return - elif content == 'uphold': + elif content == '2': await mod_channel.send("The appeal has been reviewed and the original decision is upheld.") user = discord.utils.get(guild.members, name=self.active_mod_flow['message_author']) if user: diff --git a/DiscordBot/user_stats.json b/DiscordBot/user_stats.json index 139db9d9..8b4ae325 100644 --- a/DiscordBot/user_stats.json +++ b/DiscordBot/user_stats.json @@ -3,10 +3,22 @@ "total_reports": 1, "reports": [ { - "timestamp": "2025-05-27T19:39:16.228235", + "timestamp": "2025-05-27T19:46:25.232435", "report_type": "HEALTH MISINFO - TREATMENTS", "report_content": "this is health misinformation", - "outcome": "Post removed and incident recorded", + "outcome": "Post removed and user removed", + "explanation": "not true" + } + ] + }, + "484531188581793803": { + "total_reports": 1, + "reports": [ + { + "timestamp": "2025-05-27T19:47:11.110317", + "report_type": "HEALTH MISINFO - TREATMENTS", + "report_content": "this is news misinfo political", + "outcome": "Post removed and user removed", "explanation": "not true" } ] From 819de70938b922c6698da7fdfd8b9e8385c784dc Mon Sep 17 00:00:00 2001 From: anushehchaudry Date: Sat, 31 May 2025 11:44:06 -0700 Subject: [PATCH 08/17] Fixed appeal --- DiscordBot/bot.py | 119 +++++++++++++++++++++++++------------ DiscordBot/user_stats.json | 24 ++------ 2 files changed, 87 insertions(+), 56 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index 5097a3de..212c5acd 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -39,6 +39,8 @@ def __init__(self): self.pending_appeals = {} self.active_mod_flow = None # State for the current moderation flow self.user_stats = UserStats() # Initialize user statistics tracking + self.awaiting_appeal_confirmation = {} + self.awaiting_appeal_reason = {} async def on_ready(self): print(f'{self.user.name} has connected to Discord! It is these guilds:') @@ -82,39 +84,62 @@ async def handle_dm(self, message): if not user_appeals: return - # Process the first pending appeal - info = user_appeals.pop(0) - if not user_appeals: - # Remove the user from pending_appeals if no appeals remain - del self.pending_appeals[message.author.id] - - mod_chan = self.mod_channels[info['guild_id']] + # Check if the user is in the middle of an appeal confirmation + if hasattr(self, 'awaiting_appeal_confirmation') and self.awaiting_appeal_confirmation.get(message.author.id): + if message.content.strip() == '1': # User wants to appeal + await message.channel.send("Please provide your reasoning for appealing:") + self.awaiting_appeal_confirmation[message.author.id] = False + self.awaiting_appeal_reason[message.author.id] = True + return + elif message.content.strip() == '2': # User does not want to appeal + await message.channel.send("Thank you.") + self.awaiting_appeal_confirmation[message.author.id] = False + return + else: + await message.channel.send("Invalid response. 
Please reply with 1 for Yes or 2 for No.") + return + + # Check if the user is providing their appeal reasoning + if hasattr(self, 'awaiting_appeal_reason') and self.awaiting_appeal_reason.get(message.author.id): + # Process the appeal reasoning + info = user_appeals.pop(0) + if not user_appeals: + # Remove the user from pending_appeals if no appeals remain + del self.pending_appeals[message.author.id] + + mod_chan = self.mod_channels[info['guild_id']] + + # Build the appeal notice + text = ( + f"APPEAL RECEIVED:\n" + f"User: {info['reported_name']}\n" + f"Outcome: {info['outcome']}\n\n" + f"Original Message:\n{info['original_message']}" + ) + if info.get('explanation'): + text += f"\n\nReason: {info['explanation']}" + text += f"\n\nAppeal Reason:\n{message.content}" + + # Send to mod channel + await mod_chan.send(text) + + # Prompt mods for ACCEPT/UPHOLD + self.active_mod_flow = { + 'step': 'appeal_review', + 'message_author': info['reported_name'], + 'context': {}, + 'guild_id': info['guild_id'] + } + await mod_chan.send("Moderators, please respond with:\n1. ACCEPT\n2. UPHOLD") + + # Acknowledge to user + await message.channel.send("Your appeal has been submitted and is under review.") + self.awaiting_appeal_reason[message.author.id] = False + return - # Build the appeal notice - text = ( - f"APPEAL RECEIVED:\n" - f"User: {info['reported_name']}\n" - f"Outcome: {info['outcome']}\n\n" - f"Original Message:\n{info['original_message']}" - ) - if info.get('explanation'): - text += f"\n\nReason: {info['explanation']}" - text += f"\n\nAppeal Reason:\n{message.content}" - - # Send to mod channel - await mod_chan.send(text) - - # Prompt mods for ACCEPT/UPHOLD - self.active_mod_flow = { - 'step': 'appeal_review', - 'message_author': info['reported_name'], - 'context': {}, - 'guild_id': info['guild_id'] - } - await mod_chan.send("Moderators, please respond with:\n1. ACCEPT\n2. UPHOLD") - - # Acknowledge to user - await message.channel.send("Your appeal has been submitted and is under review.") + if not hasattr(self, 'awaiting_appeal_confirmation'): + self.awaiting_appeal_confirmation = {} + self.awaiting_appeal_confirmation[message.author.id] = True return # Handle a help message @@ -189,13 +214,31 @@ async def notify_reported_user(self, user_name, guild, outcome, explanation=None user = discord.utils.get(guild.members, name=user_name) if user: try: - msg = f"Your message was reviewed by moderators. Outcome: {outcome}." - if original_message: - msg += f"\n\n**Original Message:**\n{original_message}" - if explanation: - msg += f"\n\n**Reason:** {explanation}" - msg += "\n\nIf you believe this was a mistake, you may reply to this message to appeal." + msg = ( + f"Your message was reviewed by moderators. Outcome: {outcome}.\n\n" + f"Original Message:\n{original_message}\n\n" + f"Reason: {explanation}\n\n" + "If you believe this was a mistake, you may reply to this message to appeal. " + "Would you like to appeal this decision?\n1. Yes\n2. 
No" + ) await user.send(msg) + + # Track pending appeal + if user.id not in self.pending_appeals: + self.pending_appeals[user.id] = [] + self.pending_appeals[user.id].append({ + 'guild_id': guild.id, + 'reported_name': user_name, + 'outcome': outcome, + 'original_message': original_message, + 'explanation': explanation + }) + + # Initialize appeal confirmation state + if not hasattr(self, 'awaiting_appeal_confirmation'): + self.awaiting_appeal_confirmation = {} + self.awaiting_appeal_confirmation[user.id] = True + except Exception as e: print(f"Failed to DM user {user_name}: {e}") diff --git a/DiscordBot/user_stats.json b/DiscordBot/user_stats.json index 8b4ae325..38125d6e 100644 --- a/DiscordBot/user_stats.json +++ b/DiscordBot/user_stats.json @@ -1,25 +1,13 @@ { - "1364364429797363722": { + "1364684437752647793": { "total_reports": 1, "reports": [ { - "timestamp": "2025-05-27T19:46:25.232435", - "report_type": "HEALTH MISINFO - TREATMENTS", - "report_content": "this is health misinformation", - "outcome": "Post removed and user removed", - "explanation": "not true" - } - ] - }, - "484531188581793803": { - "total_reports": 1, - "reports": [ - { - "timestamp": "2025-05-27T19:47:11.110317", - "report_type": "HEALTH MISINFO - TREATMENTS", - "report_content": "this is news misinfo political", - "outcome": "Post removed and user removed", - "explanation": "not true" + "timestamp": "2025-05-31T11:41:56.916052", + "report_type": "NEWS MISINFO - POLITICAL", + "report_content": "this is health misinformation (emergency)", + "outcome": "Post removed and user temporarily muted", + "explanation": "false" } ] } From ff9e43553d0dd0a257bec6d45bad606e0ac85d89 Mon Sep 17 00:00:00 2001 From: anushehchaudry Date: Sat, 31 May 2025 15:09:52 -0700 Subject: [PATCH 09/17] Add automation of abuse type classification --- DiscordBot/bot.py | 75 +++++++++++++-- DiscordBot/report.py | 190 ++++++++++++++++++++++++++++++++++++- DiscordBot/user_stats.json | 15 +-- 3 files changed, 258 insertions(+), 22 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index 212c5acd..37832ff9 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -9,6 +9,7 @@ from report import Report, AbuseType, MisinfoCategory, HealthCategory, NewsCategory, State from user_stats import UserStats import pdb +import openai # Set up logging to the console logger = logging.getLogger('discord') @@ -25,6 +26,10 @@ # If you get an error here, it means your token is formatted incorrectly. Did you put it in quotes? tokens = json.load(f) discord_token = tokens['discord'] + openai_api_key = tokens['openai'] + +openai.api_key = openai_api_key +client = openai.OpenAI(api_key=openai_api_key) class ModBot(discord.Client): @@ -41,6 +46,8 @@ def __init__(self): self.user_stats = UserStats() # Initialize user statistics tracking self.awaiting_appeal_confirmation = {} self.awaiting_appeal_reason = {} + self.openai_client = openai.OpenAI(api_key=openai_api_key) + async def on_ready(self): print(f'{self.user.name} has connected to Discord! 
It is these guilds:') @@ -85,7 +92,7 @@ async def handle_dm(self, message): return # Check if the user is in the middle of an appeal confirmation - if hasattr(self, 'awaiting_appeal_confirmation') and self.awaiting_appeal_confirmation.get(message.author.id): + if self.awaiting_appeal_confirmation.get(message.author.id): if message.content.strip() == '1': # User wants to appeal await message.channel.send("Please provide your reasoning for appealing:") self.awaiting_appeal_confirmation[message.author.id] = False @@ -94,13 +101,15 @@ async def handle_dm(self, message): elif message.content.strip() == '2': # User does not want to appeal await message.channel.send("Thank you.") self.awaiting_appeal_confirmation[message.author.id] = False + # Reset the appeal state for the user + del self.pending_appeals[message.author.id] return else: await message.channel.send("Invalid response. Please reply with 1 for Yes or 2 for No.") return # Check if the user is providing their appeal reasoning - if hasattr(self, 'awaiting_appeal_reason') and self.awaiting_appeal_reason.get(message.author.id): + if self.awaiting_appeal_reason.get(message.author.id): # Process the appeal reasoning info = user_appeals.pop(0) if not user_appeals: @@ -137,11 +146,6 @@ async def handle_dm(self, message): self.awaiting_appeal_reason[message.author.id] = False return - if not hasattr(self, 'awaiting_appeal_confirmation'): - self.awaiting_appeal_confirmation = {} - self.awaiting_appeal_confirmation[message.author.id] = True - return - # Handle a help message if message.content == Report.HELP_KEYWORD: reply = "Use the `report` command to begin the reporting process.\n" @@ -468,6 +472,63 @@ async def handle_mod_channel_message(self, message): }) self.active_mod_flow = None return + + async def classify_abuse_type(self, message_content): + system_prompt = ( + "You are a content moderation assistant. Your job is to classify messages into one of the following top-level abuse types: " + "BULLYING, SUICIDE/SELF-HARM, SEXUALLY EXPLICIT/NUDITY, MISINFORMATION, HATE SPEECH, or DANGER.\n\n" + "If the abuse type is MISINFORMATION, you must specify the misinformation category as:\n" + "- HEALTH (with one of these subcategories: EMERGENCY, MEDICAL RESEARCH, REPRODUCTIVE HEALTH, TREATMENTS, ALTERNATIVE MEDICINE)\n" + "- ADVERTISEMENT\n" + "- NEWS (with one of these subcategories: HISTORICAL, POLITICAL, SCIENTIFIC)\n\n" + "Respond in this format exactly:\n" + "- For general types: `BULLYING`, `HATE SPEECH`, etc.\n" + "- For misinformation types: `HEALTH (EMERGENCY) MISINFORMATION`, `NEWS (POLITICAL) MISINFORMATION`, `ADVERTISEMENT MISINFORMATION`, etc.\n" + "- If the message does not fit any of these categories, respond with: `UNKNOWN`\n\n" + "Only return the final category label." 
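+                # Illustrative (hypothetical) input -> label pairs this prompt is meant to yield:
+                #   "Drinking bleach cures COVID"  -> HEALTH (TREATMENTS) MISINFORMATION
+                #   "The moon landing was staged"  -> NEWS (HISTORICAL) MISINFORMATION
+                #   "Nobody likes you, just quit"  -> BULLYING
+                # NOTE: the subcategory spellings above (REPRODUCTIVE HEALTH, SCIENTIFIC) differ
+                # from the manual flow's enum values ("reproductive healthcare", "science"), so
+                # LLM-classified and manually classified reports can carry different
+                # report_type strings for the same content.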
+ ) + user_prompt = f"Message: {message_content}\n\nClassify the abuse type:" + try: + response = self.openai_client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + ) + print("LLM Response:", response) + abuse_type = response.choices[0].message.content.strip().upper() + return abuse_type + except Exception as e: + print(f"Error classifying abuse type: {e}") + return "UNKNOWN" + + def normalize_abuse_type(self, label): + label = label.upper() + if "MISINFORMATION" in label: + # Handle misinformation categories + if "HEALTH" in label: + subcategory = re.search(r"\((.*?)\)", label) + if subcategory: + return f"HEALTH MISINFO - {subcategory.group(1).upper()}" + return "HEALTH MISINFO" + if "ADVERTISEMENT" in label: + return "ADVERTISING MISINFO" + if "NEWS" in label: + subcategory = re.search(r"\((.*?)\)", label) + if subcategory: + return f"NEWS MISINFO - {subcategory.group(1).upper()}" + return "NEWS MISINFO" + # Handle general abuse types + valid_labels = { + "BULLYING": "BULLYING", + "SUICIDE/SELF-HARM": "SUICIDE/SELF-HARM", + "SEXUALLY EXPLICIT/NUDITY": "SEXUALLY EXPLICIT/NUDITY", + "HATE SPEECH": "HATE SPEECH", + "DANGER": "DANGER" + } + return valid_labels.get(label, None) + async def prompt_next_moderation_step(self, mod_channel): await mod_channel.send("Moderator, please review the report and respond with your decision.") diff --git a/DiscordBot/report.py b/DiscordBot/report.py index 72ce422b..9d8bfefe 100644 --- a/DiscordBot/report.py +++ b/DiscordBot/report.py @@ -13,6 +13,7 @@ class State(Enum): REPORT_COMPLETE = auto() AWAITING_APPEAL = auto() APPEAL_REVIEW = auto() + AWAITING_USER_CONFIRMATION = auto() class AbuseType(Enum): BULLYING = "bullying" @@ -52,7 +53,7 @@ def __init__(self, client): self.misinfo_category = None self.specific_category = None - async def handle_message(self, message): + """ async def handle_message(self, message): if message.content.lower() == self.CANCEL_KEYWORD: self.state = State.REPORT_COMPLETE return ["Report cancelled."] @@ -217,6 +218,193 @@ async def handle_message(self, message): ) return ["This has been sent to our team."] + return [] """ + + + async def handle_message(self, message): + if message.content.lower() == self.CANCEL_KEYWORD: + self.state = State.REPORT_COMPLETE + return ["Report cancelled."] + + if self.state == State.REPORT_START: + reply = "Thank you for starting the reporting process. " + reply += "Say `help` at any time for more information.\n\n" + reply += "Please copy paste the link to the message you want to report.\n" + reply += "You can obtain this link by right-clicking the message and clicking `Copy Message Link`." + self.state = State.AWAITING_MESSAGE + return [reply] + + if self.state == State.AWAITING_MESSAGE: + m = re.search('/(\d+)/(\d+)/(\d+)', message.content) + if not m: + return ["I'm sorry, I couldn't read that link. Please try again or say `cancel` to cancel."] + guild = self.client.get_guild(int(m.group(1))) + if not guild: + return ["I cannot accept reports of messages from guilds that I'm not in. Please have the guild owner add me to the guild and try again."] + channel = guild.get_channel(int(m.group(2))) + if not channel: + return ["It seems this channel was deleted or never existed. Please try again or say `cancel` to cancel."] + try: + self.message = await channel.fetch_message(int(m.group(3))) + except discord.errors.NotFound: + return ["It seems this message was deleted or never existed. 
Please try again or say `cancel` to cancel."] + + abuse_type_raw = await self.client.classify_abuse_type(self.message.content) + self.abuse_type = self.client.normalize_abuse_type(abuse_type_raw) + if self.abuse_type: + self.state = State.AWAITING_USER_CONFIRMATION + return [ + f"I found this message:", + f"```{self.message.author.name}: {self.message.content}```", + f"The system classified this message as {self.abuse_type}.", + "Do you agree with this classification?\n1. Yes\n2. No" + ] + else: + # If the LLM cannot classify, fall back to manual abuse type selection + self.state = State.AWAITING_ABUSE_TYPE + reply = "What type of abuse would you like to report?\n" + reply += "1. BULLYING\n" + reply += "2. SUICIDE/SELF-HARM\n" + reply += "3. SEXUALLY EXPLICIT/NUDITY\n" + reply += "4. MISINFORMATION\n" + reply += "5. HATE SPEECH\n" + reply += "6. DANGER" + return [ + f"I found this message:", + f"```{self.message.author.name}: {self.message.content}```", + reply + ] + + if self.state == State.AWAITING_USER_CONFIRMATION: + if message.content.strip() == '1': # User agrees with classification + mod_channel = self.client.mod_channels[self.message.guild.id] + await mod_channel.send(f"{self.abuse_type} REPORT:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type=self.abuse_type, + report_content=self.message.content, + message_author=self.message.author.name + ) + self.state = State.REPORT_COMPLETE + return ["Thank you for confirming. This has been sent to our moderation team for review."] + elif message.content.strip() == '2': # User disagrees with classification + self.state = State.AWAITING_ABUSE_TYPE + reply = "What type of abuse would you like to report?\n" + reply += "1. BULLYING\n" + reply += "2. SUICIDE/SELF-HARM\n" + reply += "3. SEXUALLY EXPLICIT/NUDITY\n" + reply += "4. MISINFORMATION\n" + reply += "5. HATE SPEECH\n" + reply += "6. DANGER" + return [reply] + else: + return ["Invalid response. Please reply with 1 for Yes or 2 for No."] + + if self.state == State.AWAITING_ABUSE_TYPE: + abuse_type = message.content.strip() + abuse_types = { + '1': AbuseType.BULLYING, + '2': AbuseType.SUICIDE, + '3': AbuseType.EXPLICIT, + '4': AbuseType.MISINFORMATION, + '5': AbuseType.HATE, + '6': AbuseType.DANGER + } + + if abuse_type not in abuse_types: + return ["Please select a valid option (1-6) from the list above."] + + self.abuse_type = abuse_types[abuse_type] + + if self.abuse_type == AbuseType.MISINFORMATION: + self.state = State.AWAITING_MISINFO_CATEGORY + return ["Please select the misinformation category:\n1. HEALTH\n2. ADVERTISEMENT\n3. NEWS"] + else: + mod_channel = self.client.mod_channels[self.message.guild.id] + await mod_channel.send(f"{self.abuse_type.value.upper()} REPORT:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type=self.abuse_type.value.upper(), + report_content=self.message.content, + message_author=self.message.author.name + ) + self.state = State.REPORT_COMPLETE + return ["Thank you for reporting. 
This has been sent to our moderation team for review."] + + if self.state == State.AWAITING_MISINFO_CATEGORY: + category = message.content.strip() + misinfo_categories = { + '1': MisinfoCategory.HEALTH, + '2': MisinfoCategory.ADVERTISEMENT, + '3': MisinfoCategory.NEWS + } + + if category not in misinfo_categories: + return ["Please select a valid option (1-3) from the list above."] + + self.misinfo_category = misinfo_categories[category] + + if self.misinfo_category == MisinfoCategory.HEALTH: + self.state = State.AWAITING_HEALTH_CATEGORY + return ["Please specify the health misinformation category:\n1. EMERGENCY\n2. MEDICAL RESEARCH\n3. REPRODUCTIVE HEALTHCARE\n4. TREATMENTS\n5. ALTERNATIVE MEDICINE"] + elif self.misinfo_category == MisinfoCategory.NEWS: + self.state = State.AWAITING_NEWS_CATEGORY + return ["Please specify the news category:\n1. HISTORICAL\n2. POLITICAL\n3. SCIENCE"] + else: # Advertisement + self.state = State.REPORT_COMPLETE + mod_channel = self.client.mod_channels[self.message.guild.id] + await mod_channel.send(f"ADVERTISING MISINFO:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type="ADVERTISING MISINFO", + report_content=self.message.content, + message_author=self.message.author.name + ) + return ["This has been reported to our ad team."] + + if self.state == State.AWAITING_HEALTH_CATEGORY: + health_cat = message.content.strip() + health_categories = { + '1': HealthCategory.EMERGENCY, + '2': HealthCategory.MEDICAL_RESEARCH, + '3': HealthCategory.REPRODUCTIVE, + '4': HealthCategory.TREATMENTS, + '5': HealthCategory.ALTERNATIVE + } + + if health_cat not in health_categories: + return ["Please select a valid option (1-5) from the list above."] + + self.specific_category = health_categories[health_cat] + self.state = State.REPORT_COMPLETE + mod_channel = self.client.mod_channels[self.message.guild.id] + await mod_channel.send(f"HEALTH MISINFO - {self.specific_category.value.upper()}:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type=f"HEALTH MISINFO - {self.specific_category.value.upper()}", + report_content=self.message.content, + message_author=self.message.author.name + ) + return ["This has been sent to our moderation team."] + + if self.state == State.AWAITING_NEWS_CATEGORY: + news_cat = message.content.strip() + news_categories = { + '1': NewsCategory.HISTORICAL, + '2': NewsCategory.POLITICAL, + '3': NewsCategory.SCIENCE + } + + if news_cat not in news_categories: + return ["Please select a valid option (1-3) from the list above."] + + self.specific_category = news_categories[news_cat] + self.state = State.REPORT_COMPLETE + mod_channel = self.client.mod_channels[self.message.guild.id] + await mod_channel.send(f"NEWS MISINFO - {self.specific_category.value.upper()}:\n{self.message.author.name}: {self.message.content}") + await self.client.start_moderation_flow( + report_type=f"NEWS MISINFO - {self.specific_category.value.upper()}", + report_content=self.message.content, + message_author=self.message.author.name + ) + return ["This has been sent to our team."] + return [] async def notify_reported_user(self, user_name, guild, outcome, explanation=None): diff --git a/DiscordBot/user_stats.json b/DiscordBot/user_stats.json index 38125d6e..9e26dfee 100644 --- a/DiscordBot/user_stats.json +++ b/DiscordBot/user_stats.json @@ -1,14 +1 @@ -{ - "1364684437752647793": { - "total_reports": 1, - "reports": [ - { - "timestamp": "2025-05-31T11:41:56.916052", - 
"report_type": "NEWS MISINFO - POLITICAL", - "report_content": "this is health misinformation (emergency)", - "outcome": "Post removed and user temporarily muted", - "explanation": "false" - } - ] - } -} \ No newline at end of file +{} \ No newline at end of file From 7fbf32c36d5ffef26a406ae7d122003d71020692 Mon Sep 17 00:00:00 2001 From: anushehchaudry Date: Sun, 1 Jun 2025 13:24:48 -0700 Subject: [PATCH 10/17] Add automation of risk, post action, and user action --- DiscordBot/bot.py | 493 +++++++++++++++++++++++++++++++++---- DiscordBot/report.py | 168 ------------- DiscordBot/user_stats.json | 15 +- 3 files changed, 465 insertions(+), 211 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index 37832ff9..d2359e94 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -201,9 +201,34 @@ async def start_moderation_flow(self, report_type, report_content, message_autho mod_channel = channel break if mod_channel: - await mod_channel.send(f"A new report has been submitted:\nType: {report_type}\nContent: {report_content}\nReported user: {message_author}") if initial_step == 'danger_level': - await mod_channel.send("What is the level of danger for this report?\n1. LOW\n2. MEDIUM\n3. HIGH") + self.active_mod_flow = { + 'step': 'confirm_danger_level', + 'report_type': report_type, + 'report_content': report_content, + 'message_author': message_author, + 'message_link': message_link, + 'context': {} + } + # pick any one mod‐channel + mod_channel = next(iter(self.mod_channels.values()), None) + if not mod_channel: + return + + # Let LLM guess LOW/MEDIUM/HIGH + predicted = await self.classify_danger_level(report_content) + self.active_mod_flow['context']['predicted_danger'] = predicted + + await mod_channel.send( + f"A new report has been submitted:\n" + f"Type: {report_type}\n" + f"Content: {report_content}\n" + f"Reported user: {message_author}\n\n" + f"System suggests danger level: {predicted.upper()}. Do you agree?\n" + "1. Yes\n" + "2. No" + ) + return elif initial_step == 'advertising_done': await mod_channel.send("Report sent to advertising team. No further action required.") self.active_mod_flow = None @@ -265,6 +290,305 @@ async def handle_mod_channel_message(self, message): mod_channel = message.channel guild = mod_channel.guild if hasattr(mod_channel, 'guild') else None + ctx = self.active_mod_flow.get('context', {}) + report_type = self.active_mod_flow['report_type'] + report_content = self.active_mod_flow['report_content'] + reported_user_name = self.active_mod_flow['message_author'] + + if step == 'confirm_danger_level': + if content == '1': # Moderator agrees with LLM + predicted = ctx.get('predicted_danger', 'medium') + ctx['danger_level'] = predicted + + # Now ask LLM to recommend a post‐action + post_action = await self.classify_post_action(report_content, predicted) + ctx['predicted_post_action'] = post_action # e.g. "remove", etc. + + label_map = { + "do_not_recommend": "DO NOT RECOMMEND", + "flag_as_unproven": "FLAG AS UNPROVEN", + "remove": "REMOVE", + "raise": "RAISE", + "report_to_authorities": "REPORT TO AUTHORITIES" + } + action_label = label_map.get(post_action, None) + + if action_label: + await mod_channel.send( + f"System suggests post action: {action_label}. Do you agree?\n" + "1. Yes\n" + "2. No" + ) + self.active_mod_flow['step'] = 'confirm_post_action' + return + else: + # If LLM failed to return a valid post‐action, fall back to manual + if predicted == 'low': + await mod_channel.send( + "Predicted LOW danger. 
After claim is investigated, what action should be taken on post?\n" + "1. DO NOT RECOMMEND\n" + "2. FLAG AS UNPROVEN" + ) + self.active_mod_flow['step'] = 'low_action_on_post' + return + else: + await mod_channel.send( + f"Predicted {predicted.upper()} danger. After claim is investigated, what action should be taken on post?\n" + "1. REMOVE\n" + "2. RAISE\n" + "3. REPORT TO AUTHORITIES" + ) + self.active_mod_flow['step'] = ('medium_action_on_post' + if predicted == 'medium' else 'high_action_on_post') + return + + if content == '2': # Moderator disagrees with LLM’s danger‐level + await mod_channel.send( + "What is the level of danger for this report?\n" + "1. LOW\n" + "2. MEDIUM\n" + "3. HIGH" + ) + self.active_mod_flow['step'] = 'danger_level_manual' + return + + await mod_channel.send("Invalid response. Please reply with:\n1. Yes\n2. No") + return + + if step == 'danger_level_manual': + if content not in ['1','2','3']: + await mod_channel.send("Invalid option. Please choose:\n1. LOW\n2. MEDIUM\n3. HIGH") + return + + levels = {'1':'low','2':'medium','3':'high'} + chosen = levels[content] + ctx['danger_level'] = chosen + + if chosen == 'low': + await mod_channel.send( + "Flag post as LOW danger. After claim is investigated, what action should be taken on post?\n" + "1. DO NOT RECOMMEND\n" + "2. FLAG AS UNPROVEN" + ) + self.active_mod_flow['step'] = 'low_action_on_post' + else: + await mod_channel.send( + f"Flag post as {chosen.upper()} danger. After claim is investigated, what action should be taken on post?\n" + "1. REMOVE\n" + "2. RAISE\n" + "3. REPORT TO AUTHORITIES" + ) + self.active_mod_flow['step'] = ('medium_action_on_post' + if chosen == 'medium' else 'high_action_on_post') + return + + if step == 'confirm_post_action': + if content == '1': # Mod agrees with LLM’s post‐action + post_action = ctx.get('predicted_post_action') + danger = ctx.get('danger_level') + # Retrieve the reported User object + reported_user = discord.utils.get(guild.members, name=reported_user_name) + + # LOW‐danger branches + if danger == 'low': + if post_action == 'do_not_recommend': + await mod_channel.send( + "Post will not be recommended. Action recorded. " + "(Update algorithm so post is not recommended.)" + ) + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Post not recommended" + ) + await self.notify_reported_user( + reported_user_name, guild, + outcome="Post not recommended." + ) + self.active_mod_flow = None + return + + elif post_action == 'flag_as_unproven': + await mod_channel.send( + "System suggests FLAG AS UNPROVEN. " + "Please add explanation for why post is being flagged." + ) + self.active_mod_flow['step'] = 'flag_explanation' + return + + # MEDIUM/HIGH‐danger branches + else: + if post_action == 'remove': + await mod_channel.send( + "System suggests REMOVE. Please add explanation for why post is being removed." + ) + self.active_mod_flow['step'] = 'remove_explanation' + return + + elif post_action == 'raise': + await mod_channel.send( + "System suggests RAISE to higher level moderator. " + "Report sent to higher level moderators." + ) + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Report raised to higher level moderator" + ) + self.active_mod_flow = None + return + + elif post_action == 'report_to_authorities': + await mod_channel.send( + "System suggests REPORT TO AUTHORITIES. Report sent to authorities." 
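+                        # Escalation map implemented by these confirm/manual handlers
+                        # (a sketch of the flow as written here, not an external spec):
+                        #   LOW danger   -> DO NOT RECOMMEND | FLAG AS UNPROVEN (asks for an explanation)
+                        #   MEDIUM/HIGH  -> REMOVE | RAISE | REPORT TO AUTHORITIES
+                        #   REMOVE       -> asks for an explanation, then a follow-up action on the
+                        #                   user: RECORD INCIDENT | TEMPORARILY MUTE | REMOVE USER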
+ ) + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Reported to authorities" + ) + self.active_mod_flow = None + return + + # Fallback if LLM recommendation is invalid + await mod_channel.send("Could not interpret recommended post action. Please choose manually.") + danger = ctx.get('danger_level') + if danger == 'low': + await mod_channel.send( + "After claim is investigated, what action should be taken on post?\n" + "1. DO NOT RECOMMEND\n" + "2. FLAG AS UNPROVEN" + ) + self.active_mod_flow['step'] = 'low_action_on_post' + else: + await mod_channel.send( + "After claim is investigated, what action should be taken on post?\n" + "1. REMOVE\n" + "2. RAISE\n" + "3. REPORT TO AUTHORITIES" + ) + self.active_mod_flow['step'] = ( + 'medium_action_on_post' if danger == 'medium' else 'high_action_on_post' + ) + return + + if content == '2': # Mod overrides–go manual + danger = ctx.get('danger_level') + if danger == 'low': + await mod_channel.send( + "What action should be taken on post?\n" + "1. DO NOT RECOMMEND\n" + "2. FLAG AS UNPROVEN" + ) + self.active_mod_flow['step'] = 'low_action_on_post' + else: + await mod_channel.send( + "What action should be taken on post?\n" + "1. REMOVE\n" + "2. RAISE\n" + "3. REPORT TO AUTHORITIES" + ) + self.active_mod_flow['step'] = ( + 'medium_action_on_post' if danger == 'medium' else 'high_action_on_post' + ) + return + + await mod_channel.send("Invalid response. Please reply with:\n1. Yes\n2. No") + return + + if step == 'confirm_user_action': + if content == '1': # Mod agrees with LLM’s user‐action + user_action = ctx.get('predicted_user_action') + reported_user = discord.utils.get(guild.members, name=reported_user_name) + + if user_action == 'record_incident': + await mod_channel.send("Incident recorded for internal use. (Add to internal incident count for user.)") + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Post removed and incident recorded", + ctx.get('remove_explanation', '') + ) + self.active_mod_flow = None + return + + elif user_action == 'temporarily_mute': + await mod_channel.send("User will be muted for 24 hours.") + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Post removed and user temporarily muted", + ctx.get('remove_explanation', '') + ) + await self.notify_reported_user( + reported_user_name, + guild, + outcome="You have been temporarily muted.", + explanation="You violated the community guidelines.", + original_message=report_content + ) + self.active_mod_flow = None + return + + elif user_action == 'remove_user': + await mod_channel.send("User will be removed.") + self.user_stats.add_report( + reported_user.id, + report_type, + report_content, + "Post removed and user removed", + ctx.get('remove_explanation', '') + ) + await self.notify_reported_user( + reported_user_name, + guild, + outcome="You have been removed from the server.", + explanation="You violated the community guidelines.", + original_message=report_content + ) + # Track for appeal if removed + user_obj = reported_user + if user_obj: + if user_obj.id not in self.pending_appeals: + self.pending_appeals[user_obj.id] = [] + self.pending_appeals[user_obj.id].append({ + 'guild_id': guild.id, + 'reported_name': reported_user_name, + 'outcome': "You have been removed from the server.", + 'original_message': report_content, + 'explanation': "You violated the community guidelines." 
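+                            # Appeal record consumed later by handle_dm's appeal flow:
+                            #   guild_id routes the appeal to the right mod channel,
+                            #   reported_name identifies the user to moderators, and
+                            #   outcome / original_message / explanation are echoed back
+                            #   in the APPEAL RECEIVED notice.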
+ }) + self.active_mod_flow = None + return + + # Fallback to manual if LLM output was unexpected + await mod_channel.send( + "Could not interpret recommended user action. Please choose manually:\n" + "1. RECORD INCIDENT\n" + "2. TEMPORARILY MUTE\n" + "3. REMOVE USER" + ) + self.active_mod_flow['step'] = 'action_on_user' + return + + if content == '2': # Mod overrides → manual user‐action + await mod_channel.send( + "What action should be taken on the creator of the post?\n" + "1. RECORD INCIDENT\n" + "2. TEMPORARILY MUTE\n" + "3. REMOVE USER" + ) + self.active_mod_flow['step'] = 'action_on_user' + return + + await mod_channel.send("Invalid response. Please reply with:\n1. Yes\n2. No") + return + if step == 'appeal_review': if content == '1': await mod_channel.send("The appeal has been accepted. The original decision has been overturned.") @@ -302,24 +626,6 @@ async def handle_mod_channel_message(self, message): # Already handled self.active_mod_flow = None return - if step == 'danger_level': - if content not in ['1', '2', '3']: - await mod_channel.send("Invalid option. Please choose:\n1. LOW\n2. MEDIUM\n3. HIGH") - return - danger_levels = {'1': 'low', '2': 'medium', '3': 'high'} - ctx['danger_level'] = danger_levels[content] - if content == '1': # LOW - await mod_channel.send("Flag post as low danger. After claim is investigated, what action should be taken on post?\n1. DO NOT RECOMMEND\n2. FLAG AS UNPROVEN") - self.active_mod_flow['step'] = 'low_action_on_post' - return - elif content == '2': # MEDIUM - await mod_channel.send("Flag post as medium danger. After claim is investigated, what action should be taken on post?\n1. REMOVE\n2. RAISE\n3. REPORT TO AUTHORITIES") - self.active_mod_flow['step'] = 'medium_action_on_post' - return - elif content == '3': # HIGH - await mod_channel.send("Flag post as high danger. What emergency action should be taken based on post?\n1. REMOVE\n2. RAISE\n3. REPORT TO AUTHORITIES") - self.active_mod_flow['step'] = 'high_action_on_post' - return if step == 'low_action_on_post': if content not in ['1', '2']: await mod_channel.send("Invalid option. Please choose:\n1. DO NOT RECOMMEND\n2. FLAG AS UNPROVEN") @@ -382,33 +688,47 @@ async def handle_mod_channel_message(self, message): if step == 'remove_explanation': explanation = message.content ctx['remove_explanation'] = explanation + + # Notify user that their post was removed await self.notify_reported_user( reported_user_name, guild, outcome="Post removed.", - explanation=ctx.get('remove_explanation', ''), + explanation=explanation, original_message=report_content ) - # Send only the appeal prompt - user = discord.utils.get(guild.members, name=reported_user_name) - if user: - # Track for incoming DM in pending_appeals - if user.id not in self.pending_appeals: - self.pending_appeals[user.id] = [] - self.pending_appeals[user.id].append({ - 'guild_id': guild.id, - 'reported_name': reported_user_name, - 'outcome': "Post removed.", - 'original_message': report_content, - 'explanation': explanation - }) - await mod_channel.send( - f"Explanation recorded: {explanation}\n" - "What action should be taken on the creator of the post?\n" - "1. RECORD INCIDENT\n2. TEMPORARILY MUTE\n3. 
REMOVE USER" - ) - self.active_mod_flow['step'] = 'action_on_user' - return + + # 1) Let LLM recommend a user‐action now that post is removed + recommended = await self.classify_user_action(report_content, + ctx.get('danger_level', 'medium'), + 'remove') + ctx['predicted_user_action'] = recommended + + label_map = { + "record_incident": "RECORD INCIDENT", + "temporarily_mute": "TEMPORARILY MUTE", + "remove_user": "REMOVE USER" + } + action_label = label_map.get(recommended, None) + + if action_label: + await mod_channel.send( + f"System suggests user action: {action_label}. Do you agree?\n" + "1. Yes\n" + "2. No" + ) + self.active_mod_flow['step'] = 'confirm_user_action' + return + else: + # If LLM failed, fall back to manual: + await mod_channel.send( + "What action should be taken on the creator of the post?\n" + "1. RECORD INCIDENT\n" + "2. TEMPORARILY MUTE\n" + "3. REMOVE USER" + ) + self.active_mod_flow['step'] = 'action_on_user' + return if step == 'action_on_user': if content not in ['1', '2', '3']: await mod_channel.send("Invalid option. Please choose:\n1. RECORD INCIDENT\n2. TEMPORARILY MUTE\n3. REMOVE USER") @@ -496,7 +816,6 @@ async def classify_abuse_type(self, message_content): {"role": "user", "content": user_prompt} ] ) - print("LLM Response:", response) abuse_type = response.choices[0].message.content.strip().upper() return abuse_type except Exception as e: @@ -528,7 +847,97 @@ def normalize_abuse_type(self, label): "DANGER": "DANGER" } return valid_labels.get(label, None) + + async def classify_danger_level(self, report_content): + system_prompt = ( + "You are a content moderation assistant. " + "Given a single user post, classify how dangerous its misinformation is to real-world safety. " + "Choose exactly one of: LOW, MEDIUM, or HIGH." + ) + user_prompt = f"Post: {report_content}\n\nClassify danger level:" + try: + response = self.openai_client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + ) + lvl = response.choices[0].message.content.strip().upper() + if lvl in ("LOW", "MEDIUM", "HIGH"): + return lvl.lower() + return "medium" + except Exception: + return "medium" + + async def classify_post_action(self, report_content, danger_level): + system_prompt = ( + "You are a content moderation assistant. " + "Given a user post and its danger level (LOW, MEDIUM, or HIGH), " + "recommend exactly one action to take on the post.\n" + "- If danger is LOW, choose ONE of: DO NOT RECOMMEND or FLAG AS UNPROVEN.\n" + "- If danger is MEDIUM or HIGH, choose ONE of: REMOVE, RAISE, or REPORT TO AUTHORITIES.\n" + "Respond with exactly one of those labels." + ) + user_prompt = ( + f"Post: {report_content}\n" + f"Danger level: {danger_level.upper()}\n\n" + "Recommended post action:" + ) + try: + response = self.openai_client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + ) + action = response.choices[0].message.content.strip().upper() + mapping = { + "DO NOT RECOMMEND": "do_not_recommend", + "FLAG AS UNPROVEN": "flag_as_unproven", + "REMOVE": "remove", + "RAISE": "raise", + "REPORT TO AUTHORITIES": "report_to_authorities" + } + return mapping.get(action, None) + except Exception: + return None + + async def classify_user_action(self, report_content, danger_level, post_action): + if post_action != "remove": + return None + system_prompt = ( + "You are a content moderation assistant. 
A post has been determined to be REMOVED. " + "Recommend exactly one follow‐up action on the user:\n" + "- RECORD INCIDENT\n" + "- TEMPORARILY MUTE\n" + "- REMOVE USER\n" + "Respond with exactly one label." + ) + user_prompt = ( + f"Post: {report_content}\n" + f"Danger level: {danger_level.upper()}\n\n" + "Recommended user action:" + ) + try: + response = self.openai_client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + ) + action = response.choices[0].message.content.strip().upper() + mapping = { + "RECORD INCIDENT": "record_incident", + "TEMPORARILY MUTE": "temporarily_mute", + "REMOVE USER": "remove_user" + } + return mapping.get(action, None) + except Exception: + return None async def prompt_next_moderation_step(self, mod_channel): await mod_channel.send("Moderator, please review the report and respond with your decision.") diff --git a/DiscordBot/report.py b/DiscordBot/report.py index 9d8bfefe..5861d206 100644 --- a/DiscordBot/report.py +++ b/DiscordBot/report.py @@ -53,174 +53,6 @@ def __init__(self, client): self.misinfo_category = None self.specific_category = None - """ async def handle_message(self, message): - if message.content.lower() == self.CANCEL_KEYWORD: - self.state = State.REPORT_COMPLETE - return ["Report cancelled."] - - if self.state == State.REPORT_START: - reply = "Thank you for starting the reporting process. " - reply += "Say `help` at any time for more information.\n\n" - reply += "Please copy paste the link to the message you want to report.\n" - reply += "You can obtain this link by right-clicking the message and clicking `Copy Message Link`." - self.state = State.AWAITING_MESSAGE - return [reply] - - if self.state == State.AWAITING_MESSAGE: - m = re.search('/(\d+)/(\d+)/(\d+)', message.content) - if not m: - return ["I'm sorry, I couldn't read that link. Please try again or say `cancel` to cancel."] - guild = self.client.get_guild(int(m.group(1))) - if not guild: - return ["I cannot accept reports of messages from guilds that I'm not in. Please have the guild owner add me to the guild and try again."] - channel = guild.get_channel(int(m.group(2))) - if not channel: - return ["It seems this channel was deleted or never existed. Please try again or say `cancel` to cancel."] - try: - self.message = await channel.fetch_message(int(m.group(3))) - except discord.errors.NotFound: - return ["It seems this message was deleted or never existed. Please try again or say `cancel` to cancel."] - - self.state = State.AWAITING_ABUSE_TYPE - reply = "What type of abuse would you like to report?\n" - reply += "1. BULLYING\n" - reply += "2. SUICIDE/SELF-HARM\n" - reply += "3. SEXUALLY EXPLICIT/NUDITY\n" - reply += "4. MISINFORMATION\n" - reply += "5. HATE SPEECH\n" - reply += "6. 
DANGER" - return ["I found this message:", "```" + self.message.author.name + ": " + self.message.content + "```", reply] - - if self.state == State.AWAITING_ABUSE_TYPE: - abuse_type = message.content.strip() - abuse_types = { - '1': AbuseType.BULLYING, - '2': AbuseType.SUICIDE, - '3': AbuseType.EXPLICIT, - '4': AbuseType.MISINFORMATION, - '5': AbuseType.HATE, - '6': AbuseType.DANGER - } - - if abuse_type not in abuse_types: - return ["Please select a valid option (1-6) from the list above."] - - self.abuse_type = abuse_types[abuse_type] - - if self.abuse_type == AbuseType.SUICIDE: - mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"SUICIDE/SELF-HARM REPORT:\n{self.message.author.name}: {self.message.content}") - await self.client.start_moderation_flow( - report_type="SUICIDE/SELF-HARM", - report_content=self.message.content, - message_author=self.message.author.name - ) - self.state = State.REPORT_COMPLETE - return ["Thank you for reporting. This has been sent to our moderation team for review."] - - if self.abuse_type == AbuseType.EXPLICIT: - mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"EXPLICIT CONTENT REPORT:\n{self.message.author.name}: {self.message.content}") - await self.client.start_moderation_flow( - report_type="EXPLICIT CONTENT", - report_content=self.message.content, - message_author=self.message.author.name - ) - self.state = State.REPORT_COMPLETE - return ["Thank you for reporting. This has been sent to our moderation team for review."] - - if self.abuse_type == AbuseType.MISINFORMATION: - self.state = State.AWAITING_MISINFO_CATEGORY - return ["Please select the misinformation category:\n1. HEALTH\n2. ADVERTISEMENT\n3. NEWS"] - else: - mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"New report - {self.abuse_type.value.upper()}:\n{self.message.author.name}: {self.message.content}") - await self.client.start_moderation_flow( - report_type=self.abuse_type.value.upper(), - report_content=self.message.content, - message_author=self.message.author.name - ) - self.state = State.REPORT_COMPLETE - return ["Thank you for reporting, it has been sent to our moderation team."] - - if self.state == State.AWAITING_MISINFO_CATEGORY: - category = message.content.strip() - misinfo_categories = { - '1': MisinfoCategory.HEALTH, - '2': MisinfoCategory.ADVERTISEMENT, - '3': MisinfoCategory.NEWS - } - - if category not in misinfo_categories: - return ["Please select a valid option (1-3) from the list above."] - - self.misinfo_category = misinfo_categories[category] - - if self.misinfo_category == MisinfoCategory.HEALTH: - self.state = State.AWAITING_HEALTH_CATEGORY - return ["Please specify the health misinformation category:\n1. EMERGENCY\n2. MEDICAL RESEARCH\n3. REPRODUCTIVE HEALTHCARE\n4. TREATMENTS\n5. ALTERNATIVE MEDICINE"] - elif self.misinfo_category == MisinfoCategory.NEWS: - self.state = State.AWAITING_NEWS_CATEGORY - return ["Please specify the news category:\n1. HISTORICAL\n2. POLITICAL\n3. 
SCIENCE"] - else: # Advertisement - self.state = State.REPORT_COMPLETE - await self.client.mod_channels[self.message.guild.id].send(f"ADVERTISING MISINFO:\n{self.message.author.name}: {self.message.content}") - await self.client.start_moderation_flow( - report_type="ADVERTISING MISINFO", - report_content=self.message.content, - message_author=self.message.author.name - ) - return ["This has been reported to our ad team."] - - if self.state == State.AWAITING_HEALTH_CATEGORY: - health_cat = message.content.strip() - health_categories = { - '1': HealthCategory.EMERGENCY, - '2': HealthCategory.MEDICAL_RESEARCH, - '3': HealthCategory.REPRODUCTIVE, - '4': HealthCategory.TREATMENTS, - '5': HealthCategory.ALTERNATIVE - } - - if health_cat not in health_categories: - return ["Please select a valid option (1-5) from the list above."] - - self.specific_category = health_categories[health_cat] - self.state = State.REPORT_COMPLETE - mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"HEALTH MISINFO - {self.specific_category.value.upper()}:\n{self.message.author.name}: {self.message.content}") - await self.client.start_moderation_flow( - report_type=f"HEALTH MISINFO - {self.specific_category.value.upper()}", - report_content=self.message.content, - message_author=self.message.author.name - ) - return ["This has been sent to our moderation team."] - - if self.state == State.AWAITING_NEWS_CATEGORY: - news_cat = message.content.strip() - news_categories = { - '1': NewsCategory.HISTORICAL, - '2': NewsCategory.POLITICAL, - '3': NewsCategory.SCIENCE - } - - if news_cat not in news_categories: - return ["Please select a valid option (1-3) from the list above."] - - self.specific_category = news_categories[news_cat] - self.state = State.REPORT_COMPLETE - mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"NEWS MISINFO - {self.specific_category.value.upper()}:\n{self.message.author.name}: {self.message.content}") - await self.client.start_moderation_flow( - report_type=f"NEWS MISINFO - {self.specific_category.value.upper()}", - report_content=self.message.content, - message_author=self.message.author.name - ) - return ["This has been sent to our team."] - - return [] """ - - async def handle_message(self, message): if message.content.lower() == self.CANCEL_KEYWORD: self.state = State.REPORT_COMPLETE diff --git a/DiscordBot/user_stats.json b/DiscordBot/user_stats.json index 9e26dfee..f5b25e22 100644 --- a/DiscordBot/user_stats.json +++ b/DiscordBot/user_stats.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "1364684437752647793": { + "total_reports": 1, + "reports": [ + { + "timestamp": "2025-06-01T12:53:15.987346", + "report_type": "NEWS MISINFO - POLITICAL", + "report_content": "The recent election was rigged using a secret satellite controlled by a foreign government.", + "outcome": "Post removed and user temporarily muted", + "explanation": "false info" + } + ] + } +} \ No newline at end of file From 97a1f981103f954bbe063081378911ecd6263cee Mon Sep 17 00:00:00 2001 From: anushehchaudry Date: Sun, 1 Jun 2025 23:15:40 -0700 Subject: [PATCH 11/17] Add ability for user to add context and fix appeal --- DiscordBot/bot.py | 324 +++++++++++++++++++++++++------------ DiscordBot/report.py | 66 ++++++-- DiscordBot/user_stats.json | 4 +- 3 files changed, 275 insertions(+), 119 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index d2359e94..cade0028 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -111,10 +111,7 @@ 
async def handle_dm(self, message): # Check if the user is providing their appeal reasoning if self.awaiting_appeal_reason.get(message.author.id): # Process the appeal reasoning - info = user_appeals.pop(0) - if not user_appeals: - # Remove the user from pending_appeals if no appeals remain - del self.pending_appeals[message.author.id] + info = user_appeals[0] mod_chan = self.mod_channels[info['guild_id']] @@ -135,6 +132,7 @@ async def handle_dm(self, message): # Prompt mods for ACCEPT/UPHOLD self.active_mod_flow = { 'step': 'appeal_review', + 'info': info, 'message_author': info['reported_name'], 'context': {}, 'guild_id': info['guild_id'] @@ -180,63 +178,95 @@ async def handle_channel_message(self, message): elif message.channel.name == f'group-{self.group_num}': return - async def start_moderation_flow(self, report_type, report_content, message_author, message_link=None): - # Determine the initial step based on report type - if report_type.startswith('ADVERTISING MISINFO'): - initial_step = 'advertising_done' - elif report_type.startswith('MISINFORMATION') or report_type.startswith('HEALTH MISINFO') or report_type.startswith('NEWS MISINFO'): - initial_step = 'danger_level' - else: - initial_step = 'default_done' - self.active_mod_flow = { - 'step': initial_step, - 'report_type': report_type, - 'report_content': report_content, - 'message_author': message_author, - 'message_link': message_link, - 'context': {} - } - mod_channel = None - for channel in self.mod_channels.values(): - mod_channel = channel - break - if mod_channel: + async def start_moderation_flow( + self, + report_type, + report_content, + message_author, + user_context=None, + message_link=None + ): + # Determine the initial step based on report type + if report_type.startswith('ADVERTISING MISINFO'): + initial_step = 'advertising_done' + elif ( + report_type.startswith('MISINFORMATION') + or report_type.startswith('HEALTH MISINFO') + or report_type.startswith('NEWS MISINFO') + ): + initial_step = 'danger_level' + else: + initial_step = 'default_done' + + # Store everything (including user_context) up front + self.active_mod_flow = { + 'step': initial_step, + 'report_type': report_type, + 'report_content': report_content, + 'message_author': message_author, + 'message_link': message_link, + 'user_context': user_context, + 'context': {} + } + + # Pick any one moderator channel + mod_channel = None + for channel in self.mod_channels.values(): + mod_channel = channel + break + + if not mod_channel: + return + + # If this is a misinformation‐type report, run the danger‐level flow if initial_step == 'danger_level': - self.active_mod_flow = { - 'step': 'confirm_danger_level', - 'report_type': report_type, - 'report_content': report_content, - 'message_author': message_author, - 'message_link': message_link, - 'context': {} - } - # pick any one mod‐channel - mod_channel = next(iter(self.mod_channels.values()), None) - if not mod_channel: - return + # Update the step + self.active_mod_flow['step'] = 'confirm_danger_level' - # Let LLM guess LOW/MEDIUM/HIGH - predicted = await self.classify_danger_level(report_content) + # Let LLM guess LOW/MEDIUM/HIGH, passing along user_context + predicted = await self.classify_danger_level( + report_content, + user_context + ) self.active_mod_flow['context']['predicted_danger'] = predicted - await mod_channel.send( + # Build "new report" message and include user_context if provided + base_msg = ( f"A new report has been submitted:\n" f"Type: {report_type}\n" f"Content: {report_content}\n" - 
f"Reported user: {message_author}\n\n" - f"System suggests danger level: {predicted.upper()}. Do you agree?\n" + f"Reported user: {message_author}\n" + ) + if user_context: + base_msg += f"User context: {user_context}\n" + + base_msg += ( + f"\nSystem suggests danger level: {predicted.upper()}. Do you agree?\n" "1. Yes\n" "2. No" ) + await mod_channel.send(base_msg) return - elif initial_step == 'advertising_done': - await mod_channel.send("Report sent to advertising team. No further action required.") + + # Otherwise, handle the other two cases + if initial_step == 'advertising_done': + await mod_channel.send( + "Report sent to advertising team. No further action required." + ) self.active_mod_flow = None - elif initial_step == 'default_done': + return + + if initial_step == 'default_done': # Just show the report, do not prompt for reply + await mod_channel.send( + f"A new report has been submitted:\n" + f"Type: {report_type}\n" + f"Content: {report_content}\n" + f"Reported user: {message_author}" + ) self.active_mod_flow = None - else: - await self.prompt_next_moderation_step(mod_channel) + return + await self.prompt_next_moderation_step(mod_channel) async def notify_reported_user(self, user_name, guild, outcome, explanation=None, original_message=None): """Notify the user about the outcome and provide an appeal option.""" @@ -290,6 +320,63 @@ async def handle_mod_channel_message(self, message): mod_channel = message.channel guild = mod_channel.guild if hasattr(mod_channel, 'guild') else None + if step == 'appeal_review': + # Pull the info dict that was stashed earlier + info = self.active_mod_flow.get('info', {}) + reported_name = info.get('reported_name') + + # Look up the User object in this guild + reported_user = discord.utils.get(guild.members, name=reported_name) + user_id = reported_user.id if reported_user else None + + # 1) Pop this appeal out of the queue + if user_id in self.pending_appeals: + self.pending_appeals[user_id].pop(0) + if not self.pending_appeals[user_id]: + del self.pending_appeals[user_id] + + # 2) Send the DM back to the user with the moderator's decision + if content == '1': # ACCEPT + await mod_channel.send("The appeal has been accepted. The original decision has been overturned.") + if reported_user: + await reported_user.send( + "Your appeal has been accepted. The original decision has been overturned." + ) + + elif content == '2': # UPHOLD + await mod_channel.send("The appeal has been reviewed and the original decision is upheld.") + if reported_user: + await reported_user.send( + "Your appeal has been reviewed, and the original decision is upheld." + ) + + else: + await mod_channel.send("Invalid response. Please respond with:\n1. ACCEPT\n2. UPHOLD") + return + + # Clear this flow + self.active_mod_flow = None + + # 3) If that user still has more pending appeals, prompt them again + if user_id in self.pending_appeals and self.pending_appeals[user_id]: + next_info = self.pending_appeals[user_id][0] + try: + prompt_text = ( + f"Your message was reviewed by moderators. Outcome: {next_info['outcome']}.\n\n" + f"Original Message:\n{next_info['original_message']}\n\n" + ) + if next_info.get('explanation'): + prompt_text += f"Reason: {next_info['explanation']}\n\n" + prompt_text += ( + "If you believe this was a mistake, you may reply to this message to appeal. " + "Would you like to appeal this decision?\n1. Yes\n2. 
No" + ) + await reported_user.send(prompt_text) + self.awaiting_appeal_confirmation[user_id] = True + except Exception: + pass + return + ctx = self.active_mod_flow.get('context', {}) report_type = self.active_mod_flow['report_type'] report_content = self.active_mod_flow['report_content'] @@ -301,7 +388,11 @@ async def handle_mod_channel_message(self, message): ctx['danger_level'] = predicted # Now ask LLM to recommend a post‐action - post_action = await self.classify_post_action(report_content, predicted) + post_action = await self.classify_post_action( + report_content, + predicted, + self.active_mod_flow.get('user_context') + ) ctx['predicted_post_action'] = post_action # e.g. "remove", etc. label_map = { @@ -364,22 +455,49 @@ async def handle_mod_channel_message(self, message): chosen = levels[content] ctx['danger_level'] = chosen - if chosen == 'low': + # Ask LLM to recommend a post‐action given the manually chosen danger level: + predicted_action = await self.classify_post_action( + report_content, + chosen, + self.active_mod_flow.get('user_context') + ) + ctx['predicted_post_action'] = predicted_action + + label_map = { + "do_not_recommend": "DO NOT RECOMMEND", + "flag_as_unproven": "FLAG AS UNPROVEN", + "remove": "REMOVE", + "raise": "RAISE", + "report_to_authorities": "REPORT TO AUTHORITIES" + } + action_label = label_map.get(predicted_action, None) + + if action_label: await mod_channel.send( - "Flag post as LOW danger. After claim is investigated, what action should be taken on post?\n" - "1. DO NOT RECOMMEND\n" - "2. FLAG AS UNPROVEN" + f"System suggests post action: {action_label}. Do you agree?\n" + "1. Yes\n" + "2. No" ) - self.active_mod_flow['step'] = 'low_action_on_post' + self.active_mod_flow['step'] = 'confirm_post_action' else: - await mod_channel.send( - f"Flag post as {chosen.upper()} danger. After claim is investigated, what action should be taken on post?\n" - "1. REMOVE\n" - "2. RAISE\n" - "3. REPORT TO AUTHORITIES" - ) - self.active_mod_flow['step'] = ('medium_action_on_post' - if chosen == 'medium' else 'high_action_on_post') + # Fallback if LLM failed to return a valid post‐action: + if chosen == 'low': + await mod_channel.send( + "Predicted LOW danger. After claim is investigated, what action should be taken on post?\n" + "1. DO NOT RECOMMEND\n" + "2. FLAG AS UNPROVEN" + ) + self.active_mod_flow['step'] = 'low_action_on_post' + else: + await mod_channel.send( + f"Predicted {chosen.upper()} danger. After claim is investigated, what action should be taken on post?\n" + "1. REMOVE\n" + "2. RAISE\n" + "3. REPORT TO AUTHORITIES" + ) + self.active_mod_flow['step'] = ( + 'medium_action_on_post' if chosen == 'medium' else 'high_action_on_post' + ) return if step == 'confirm_post_action': @@ -589,27 +707,6 @@ async def handle_mod_channel_message(self, message): await mod_channel.send("Invalid response. Please reply with:\n1. Yes\n2. No") return - if step == 'appeal_review': - if content == '1': - await mod_channel.send("The appeal has been accepted. The original decision has been overturned.") - user = discord.utils.get(guild.members, name=self.active_mod_flow['message_author']) - if user: - await user.send("Your appeal has been accepted. 
The original decision has been overturned.") - self.active_mod_flow = None - return - - elif content == '2': - await mod_channel.send("The appeal has been reviewed and the original decision is upheld.") - user = discord.utils.get(guild.members, name=self.active_mod_flow['message_author']) - if user: - await user.send("Your appeal has been reviewed, and the original decision is upheld.") - self.active_mod_flow = None - return - - else: - await mod_channel.send("Invalid response. Please respond with:\n1. ACCEPT\n2. UPHOLD") - return - ctx = self.active_mod_flow['context'] report_type = self.active_mod_flow['report_type'] report_content = self.active_mod_flow['report_content'] @@ -699,9 +796,12 @@ async def handle_mod_channel_message(self, message): ) # 1) Let LLM recommend a user‐action now that post is removed - recommended = await self.classify_user_action(report_content, - ctx.get('danger_level', 'medium'), - 'remove') + recommended = await self.classify_user_action( + report_content, + ctx.get('danger_level', 'medium'), + 'remove', + self.active_mod_flow.get('user_context') + ) ctx['predicted_user_action'] = recommended label_map = { @@ -793,7 +893,7 @@ async def handle_mod_channel_message(self, message): self.active_mod_flow = None return - async def classify_abuse_type(self, message_content): + async def classify_abuse_type(self, message_content, user_context=None): system_prompt = ( "You are a content moderation assistant. Your job is to classify messages into one of the following top-level abuse types: " "BULLYING, SUICIDE/SELF-HARM, SEXUALLY EXPLICIT/NUDITY, MISINFORMATION, HATE SPEECH, or DANGER.\n\n" @@ -807,7 +907,10 @@ async def classify_abuse_type(self, message_content): "- If the message does not fit any of these categories, respond with: `UNKNOWN`\n\n" "Only return the final category label." ) - user_prompt = f"Message: {message_content}\n\nClassify the abuse type:" + if user_context: + user_prompt = f"User context: {user_context}\nMessage: {message_content}\n\nClassify the abuse type:" + else: + user_prompt = f"Message: {message_content}\n\nClassify the abuse type:" try: response = self.openai_client.chat.completions.create( model="gpt-4o", @@ -848,13 +951,16 @@ def normalize_abuse_type(self, label): } return valid_labels.get(label, None) - async def classify_danger_level(self, report_content): + async def classify_danger_level(self, report_content, user_context=None): system_prompt = ( "You are a content moderation assistant. " "Given a single user post, classify how dangerous its misinformation is to real-world safety. " "Choose exactly one of: LOW, MEDIUM, or HIGH." ) - user_prompt = f"Post: {report_content}\n\nClassify danger level:" + if user_context: + user_prompt = f"User context: {user_context}\nPost: {report_content}\n\nClassify danger level:" + else: + user_prompt = f"Post: {report_content}\n\nClassify danger level:" try: response = self.openai_client.chat.completions.create( model="gpt-4o", @@ -870,7 +976,7 @@ async def classify_danger_level(self, report_content): except Exception: return "medium" - async def classify_post_action(self, report_content, danger_level): + async def classify_post_action(self, report_content, danger_level, user_context=None): system_prompt = ( "You are a content moderation assistant. 
" "Given a user post and its danger level (LOW, MEDIUM, or HIGH), " @@ -879,11 +985,19 @@ async def classify_post_action(self, report_content, danger_level): "- If danger is MEDIUM or HIGH, choose ONE of: REMOVE, RAISE, or REPORT TO AUTHORITIES.\n" "Respond with exactly one of those labels." ) - user_prompt = ( - f"Post: {report_content}\n" - f"Danger level: {danger_level.upper()}\n\n" - "Recommended post action:" - ) + if user_context: + user_prompt = ( + f"User context: {user_context}\n" + f"Post: {report_content}\n" + f"Danger level: {danger_level.upper()}\n\n" + "Recommended post action:" + ) + else: + user_prompt = ( + f"Post: {report_content}\n" + f"Danger level: {danger_level.upper()}\n\n" + "Recommended post action:" + ) try: response = self.openai_client.chat.completions.create( model="gpt-4o", @@ -904,7 +1018,7 @@ async def classify_post_action(self, report_content, danger_level): except Exception: return None - async def classify_user_action(self, report_content, danger_level, post_action): + async def classify_user_action(self, report_content, danger_level, post_action, user_context=None): if post_action != "remove": return None @@ -916,11 +1030,19 @@ async def classify_user_action(self, report_content, danger_level, post_action): "- REMOVE USER\n" "Respond with exactly one label." ) - user_prompt = ( - f"Post: {report_content}\n" - f"Danger level: {danger_level.upper()}\n\n" - "Recommended user action:" - ) + if user_context: + user_prompt = ( + f"User context: {user_context}\n" + f"Post: {report_content}\n" + f"Danger level: {danger_level.upper()}\n\n" + "Recommended user action:" + ) + else: + user_prompt = ( + f"Post: {report_content}\n" + f"Danger level: {danger_level.upper()}\n\n" + "Recommended user action:" + ) try: response = self.openai_client.chat.completions.create( model="gpt-4o", diff --git a/DiscordBot/report.py b/DiscordBot/report.py index 5861d206..d04568d0 100644 --- a/DiscordBot/report.py +++ b/DiscordBot/report.py @@ -14,6 +14,8 @@ class State(Enum): AWAITING_APPEAL = auto() APPEAL_REVIEW = auto() AWAITING_USER_CONFIRMATION = auto() + AWAITING_CONTEXT_CONFIRMATION = auto() + AWAITING_CONTEXT_TEXT = auto() class AbuseType(Enum): BULLYING = "bullying" @@ -52,6 +54,7 @@ def __init__(self, client): self.abuse_type = None self.misinfo_category = None self.specific_category = None + self.user_context = None async def handle_message(self, message): if message.content.lower() == self.CANCEL_KEYWORD: @@ -109,15 +112,15 @@ async def handle_message(self, message): if self.state == State.AWAITING_USER_CONFIRMATION: if message.content.strip() == '1': # User agrees with classification - mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"{self.abuse_type} REPORT:\n{self.message.author.name}: {self.message.content}") - await self.client.start_moderation_flow( - report_type=self.abuse_type, - report_content=self.message.content, - message_author=self.message.author.name - ) - self.state = State.REPORT_COMPLETE - return ["Thank you for confirming. This has been sent to our moderation team for review."] + self.state = State.AWAITING_CONTEXT_CONFIRMATION + # stash everything needed: + self.pending_report = { + 'report_type': self.abuse_type, + 'report_content': self.message.content, + 'message_author': self.message.author.name + } + return ["Do you want to add additional context for why you are reporting this message?\n1. Yes\n2. 
No"] + elif message.content.strip() == '2': # User disagrees with classification self.state = State.AWAITING_ABUSE_TYPE reply = "What type of abuse would you like to report?\n" @@ -151,15 +154,46 @@ async def handle_message(self, message): self.state = State.AWAITING_MISINFO_CATEGORY return ["Please select the misinformation category:\n1. HEALTH\n2. ADVERTISEMENT\n3. NEWS"] else: - mod_channel = self.client.mod_channels[self.message.guild.id] - await mod_channel.send(f"{self.abuse_type.value.upper()} REPORT:\n{self.message.author.name}: {self.message.content}") + self.state = State.AWAITING_CONTEXT_CONFIRMATION + self.pending_report = { + 'report_type': self.abuse_type.value.upper(), + 'report_content': self.message.content, + 'message_author': self.message.author.name + } + return ["Do you want to add additional context for why you are reporting this message?\n1. Yes\n2. No"] + + if self.state == State.AWAITING_CONTEXT_CONFIRMATION: + if message.content.strip() == '1': # wants to add context + self.state = State.AWAITING_CONTEXT_TEXT + return ["Please enter additional context (why you are reporting):"] + elif message.content.strip() == '2': # no context + # call start_moderation_flow without context + data = self.pending_report + self.pending_report = None + self.state = State.REPORT_COMPLETE await self.client.start_moderation_flow( - report_type=self.abuse_type.value.upper(), - report_content=self.message.content, - message_author=self.message.author.name + report_type=data['report_type'], + report_content=data['report_content'], + message_author=data['message_author'], + user_context=None ) - self.state = State.REPORT_COMPLETE - return ["Thank you for reporting. This has been sent to our moderation team for review."] + return ["Thank you. Your report has been sent to the moderation team."] + else: + return ["Invalid choice. Reply with 1 (Yes) or 2 (No)."] + + if self.state == State.AWAITING_CONTEXT_TEXT: + ctx_text = message.content.strip() + data = self.pending_report + self.pending_report = None + self.user_context = ctx_text + self.state = State.REPORT_COMPLETE + await self.client.start_moderation_flow( + report_type=data['report_type'], + report_content=data['report_content'], + message_author=data['message_author'], + user_context=ctx_text + ) + return ["Thank you. 
Your report and context have been sent to the moderation team."] if self.state == State.AWAITING_MISINFO_CATEGORY: category = message.content.strip() diff --git a/DiscordBot/user_stats.json b/DiscordBot/user_stats.json index f5b25e22..9ad79ac9 100644 --- a/DiscordBot/user_stats.json +++ b/DiscordBot/user_stats.json @@ -3,10 +3,10 @@ "total_reports": 1, "reports": [ { - "timestamp": "2025-06-01T12:53:15.987346", + "timestamp": "2025-06-01T23:12:48.395295", "report_type": "NEWS MISINFO - POLITICAL", "report_content": "The recent election was rigged using a secret satellite controlled by a foreign government.", - "outcome": "Post removed and user temporarily muted", + "outcome": "Post removed and user removed", "explanation": "false info" } ] From 005928693b4156b445ef52f72468633d35390720 Mon Sep 17 00:00:00 2001 From: michaelsouliman Date: Mon, 2 Jun 2025 14:43:55 -0700 Subject: [PATCH 12/17] added classifier source code, works poorly on short text --- .gitignore | 2 + DiscordBot/bot.py | 6 +- DiscordBot/classifier/misinfo_classifier.py | 161 ++++++++++++++++++++ DiscordBot/classifier/requirements.txt | 4 + 4 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 DiscordBot/classifier/misinfo_classifier.py create mode 100644 DiscordBot/classifier/requirements.txt diff --git a/.gitignore b/.gitignore index 8989f962..65c976ba 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ *.log DiscordBot/tokens.json TABot/tokens.json +*.csv +*.joblib diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index cade0028..39b02b5d 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -8,6 +8,7 @@ import requests from report import Report, AbuseType, MisinfoCategory, HealthCategory, NewsCategory, State from user_stats import UserStats +from classifier.misinfo_classifier import predict_misinformation, load_model import pdb import openai @@ -47,6 +48,7 @@ def __init__(self): self.awaiting_appeal_confirmation = {} self.awaiting_appeal_reason = {} self.openai_client = openai.OpenAI(api_key=openai_api_key) + self.model = load_model() async def on_ready(self): @@ -176,7 +178,9 @@ async def handle_channel_message(self, message): if message.channel.name == f'group-{self.group_num}-mod': await self.handle_mod_channel_message(message) elif message.channel.name == f'group-{self.group_num}': - return + pass + # prediction = predict_misinformation(message.content, self.model) + # print(prediction) async def start_moderation_flow( self, diff --git a/DiscordBot/classifier/misinfo_classifier.py b/DiscordBot/classifier/misinfo_classifier.py new file mode 100644 index 00000000..45c6f15e --- /dev/null +++ b/DiscordBot/classifier/misinfo_classifier.py @@ -0,0 +1,161 @@ +import pandas as pd +import numpy as np +from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import classification_report, accuracy_score +from sklearn.pipeline import Pipeline +import nltk +from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize +import re +import joblib +import os + +# Download required NLTK data +nltk.download('punkt') +nltk.download('stopwords') + +def preprocess_text(text): + """Preprocess text by removing special characters, converting to lowercase, and removing stopwords.""" + if isinstance(text, str): + # Convert to lowercase + text = text.lower() + # Remove special characters and digits + text = re.sub(r'[^a-zA-Z\s]', '', text) + # 
Tokenize + tokens = word_tokenize(text) + # Remove stopwords + stop_words = set(stopwords.words('english')) + tokens = [token for token in tokens if token not in stop_words] + return ' '.join(tokens) + return '' + +def load_and_preprocess_data(): + """Load and preprocess the dataset.""" + # Load the datasets + fake_df = pd.read_csv('DataSet_Misinfo_FAKE.csv') + true_df = pd.read_csv('DataSet_Misinfo_TRUE.csv') + + # Rename columns for consistency + fake_df.columns = ['index', 'text'] + true_df.columns = ['index', 'text'] + + # Add labels + fake_df['label'] = 1 # 1 for fake/misinformation + true_df['label'] = 0 # 0 for true + + # Combine datasets + df = pd.concat([fake_df, true_df], ignore_index=True) + + # Preprocess text + df['processed_text'] = df['text'].apply(preprocess_text) + + return df + +def train_classifier(save_model=True): + """Train and evaluate the misinformation classifier with cross-validation and hyperparameter tuning.""" + # Load and preprocess data + df = load_and_preprocess_data() + + # Split data into training and testing sets + X_train, X_test, y_train, y_test = train_test_split( + df['processed_text'], + df['label'], + test_size=0.2, + random_state=42 + ) + + # Create a pipeline with TF-IDF vectorizer and classifier + pipeline = Pipeline([ + ('tfidf', TfidfVectorizer()), + ('clf', LogisticRegression()) + ]) + + # Define hyperparameter grid + param_grid = { + 'tfidf__max_features': [3000, 5000, 7000], + 'tfidf__ngram_range': [(1, 1), (1, 2)], + 'clf__C': [0.1, 1.0, 10.0], + 'clf__max_iter': [1000] + } + + # Perform grid search with cross-validation + grid_search = GridSearchCV( + pipeline, + param_grid, + cv=5, # 5-fold cross-validation + scoring='accuracy', + n_jobs=-1 # Use all available CPU cores + ) + + print("Performing grid search with cross-validation...") + grid_search.fit(X_train, y_train) + + # Get best parameters and score + print("\nBest parameters:", grid_search.best_params_) + print("Best cross-validation score:", grid_search.best_score_) + + # Evaluate on test set + best_model = grid_search.best_estimator_ + y_pred = best_model.predict(X_test) + + print("\nTest Set Classification Report:") + print(classification_report(y_test, y_pred)) + print("\nTest Set Accuracy:", accuracy_score(y_test, y_pred)) + + # Perform additional cross-validation on the best model + cv_scores = cross_val_score(best_model, X_train, y_train, cv=5) + print("\nCross-validation scores:", cv_scores) + print("Mean CV score:", cv_scores.mean()) + print("CV score std:", cv_scores.std()) + + if save_model: + # Create models directory if it doesn't exist + os.makedirs('models', exist_ok=True) + + # Save the best model and vectorizer + model_path = 'models/misinfo_classifier.joblib' + joblib.dump(best_model, model_path) + print(f"\nModel saved to {model_path}") + + return best_model + +def load_model(model_path='classifier/models/misinfo_classifier.joblib'): + """Load a trained model from file.""" + if not os.path.exists(model_path): + raise FileNotFoundError(f"Model file not found at {model_path}") + return joblib.load(model_path) + +def predict_misinformation(text, model): + """Predict if a given text is misinformation or not.""" + # Make prediction using the loaded model + prediction = model.predict([text])[0] + probability = model.predict_proba([text])[0] + + return { + 'is_misinformation': bool(prediction), + 'confidence': float(probability[prediction]), + 'true_probability': float(probability[0]), + 'fake_probability': float(probability[1]) + } + +if __name__ == "__main__": + # # 
Train the classifier + # print("Training classifier...") + # model = train_classifier(save_model=True) + + # # Example usage with loaded model + # test_text = "This is an example text to test the classifier." + # result = predict_misinformation(test_text, model) + + # print("\nExample prediction:") + # print(f"Text: {test_text}") + # print(f"Is misinformation: {result['is_misinformation']}") + # print(f"Confidence: {result['confidence']:.2f}") + # print(f"True probability: {result['true_probability']:.2f}") + # print(f"Fake probability: {result['fake_probability']:.2f}") + + model = load_model() + prediction = predict_misinformation("Russia did not interfere in the 2016 presidential election", model) + print(prediction) \ No newline at end of file diff --git a/DiscordBot/classifier/requirements.txt b/DiscordBot/classifier/requirements.txt new file mode 100644 index 00000000..7c2754b0 --- /dev/null +++ b/DiscordBot/classifier/requirements.txt @@ -0,0 +1,4 @@ +pandas>=1.3.0 +numpy>=1.21.0 +scikit-learn>=0.24.2 +nltk>=3.6.0 \ No newline at end of file From 1e3d7634549c7258f4339a2d2dd1b856e3bb210e Mon Sep 17 00:00:00 2001 From: michaelsouliman Date: Mon, 2 Jun 2025 15:16:49 -0700 Subject: [PATCH 13/17] added llm based classification, works much better on short text --- DiscordBot/bot.py | 85 ++++++++++++++++++++++++++++++++++---- DiscordBot/user_stats.json | 8 ++-- 2 files changed, 81 insertions(+), 12 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index 39b02b5d..79657eea 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -8,9 +8,10 @@ import requests from report import Report, AbuseType, MisinfoCategory, HealthCategory, NewsCategory, State from user_stats import UserStats -from classifier.misinfo_classifier import predict_misinformation, load_model +# from classifier.misinfo_classifier import predict_misinformation, load_model import pdb import openai +import time # Set up logging to the console logger = logging.getLogger('discord') @@ -48,7 +49,7 @@ def __init__(self): self.awaiting_appeal_confirmation = {} self.awaiting_appeal_reason = {} self.openai_client = openai.OpenAI(api_key=openai_api_key) - self.model = load_model() + # self.model = load_model() async def on_ready(self): @@ -178,9 +179,22 @@ async def handle_channel_message(self, message): if message.channel.name == f'group-{self.group_num}-mod': await self.handle_mod_channel_message(message) elif message.channel.name == f'group-{self.group_num}': - pass - # prediction = predict_misinformation(message.content, self.model) - # print(prediction) + # Check for misinformation in all messages + has_misinfo = await self.detect_misinformation(message.content) + + if has_misinfo: + # If misinformation is detected, classify the type for the report + abuse_type_raw = await self.classify_abuse_type(message.content) + abuse_type = self.normalize_abuse_type(abuse_type_raw) + + if abuse_type: + # Start moderation flow for the detected misinformation + await self.start_moderation_flow( + report_type=abuse_type, + report_content=message.content, + message_author=message.author.name, + message_link=message.jump_url + ) async def start_moderation_flow( self, @@ -437,7 +451,7 @@ async def handle_mod_channel_message(self, message): if predicted == 'medium' else 'high_action_on_post') return - if content == '2': # Moderator disagrees with LLM’s danger‐level + if content == '2': # Moderator disagrees with LLM's danger‐level await mod_channel.send( "What is the level of danger for this report?\n" "1. 
LOW\n" @@ -505,7 +519,7 @@ async def handle_mod_channel_message(self, message): return if step == 'confirm_post_action': - if content == '1': # Mod agrees with LLM’s post‐action + if content == '1': # Mod agrees with LLM's post‐action post_action = ctx.get('predicted_post_action') danger = ctx.get('danger_level') # Retrieve the reported User object @@ -622,7 +636,7 @@ async def handle_mod_channel_message(self, message): return if step == 'confirm_user_action': - if content == '1': # Mod agrees with LLM’s user‐action + if content == '1': # Mod agrees with LLM's user‐action user_action = ctx.get('predicted_user_action') reported_user = discord.utils.get(guild.members, name=reported_user_name) @@ -1068,5 +1082,60 @@ async def classify_user_action(self, report_content, danger_level, post_action, async def prompt_next_moderation_step(self, mod_channel): await mod_channel.send("Moderator, please review the report and respond with your decision.") + async def detect_misinformation(self, message_content): + """ + Uses OpenAI API to detect if a message contains misinformation. + Returns 1 if misinformation is detected, 0 otherwise. + Prints classification outcome and timing. + """ + import time + start_time = time.time() + + system_prompt = ( + "You are a content moderation assistant specialized in detecting misinformation. " + "Your task is to analyze if a message contains misinformation in any of these categories:\n" + "- Health misinformation (emergency, medical research, reproductive health, treatments, alternative medicine)\n" + "- News misinformation (historical, political, scientific)\n" + "- Advertising misinformation\n\n" + "Respond with ONLY a single digit:\n" + "1 - if the message contains misinformation\n" + "0 - if the message does not contain misinformation\n\n" + "Do not provide any explanation, just the number." 
+ ) + + try: + response = self.openai_client.chat.completions.create( + model="gpt-4", + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": message_content} + ] + ) + result = response.choices[0].message.content.strip() + prediction = int(result) if result in ['0', '1'] else 0 + end_time = time.time() + processing_time = end_time - start_time + + # Print classification outcome + print(f"\nMessage Classification:") + print(f"Message: {message_content}") + print(f"Prediction: {prediction}") + print(f"Processing time: {processing_time:.2f} seconds") + print("-" * 50) + + return prediction + except Exception as e: + end_time = time.time() + processing_time = end_time - start_time + + # Print error + print(f"\nClassification Error:") + print(f"Message: {message_content}") + print(f"Error: {str(e)}") + print(f"Processing time: {processing_time:.2f} seconds") + print("-" * 50) + + return 0 + client = ModBot() client.run(discord_token) \ No newline at end of file diff --git a/DiscordBot/user_stats.json b/DiscordBot/user_stats.json index 9ad79ac9..a8bf68cc 100644 --- a/DiscordBot/user_stats.json +++ b/DiscordBot/user_stats.json @@ -1,13 +1,13 @@ { - "1364684437752647793": { + "484531188581793803": { "total_reports": 1, "reports": [ { - "timestamp": "2025-06-01T23:12:48.395295", + "timestamp": "2025-06-02T15:13:04.847343", "report_type": "NEWS MISINFO - POLITICAL", "report_content": "The recent election was rigged using a secret satellite controlled by a foreign government.", - "outcome": "Post removed and user removed", - "explanation": "false info" + "outcome": "Post removed and incident recorded", + "explanation": "bad" } ] } From c0156088ffbd9442e01bf3f54db1c85c1720f3f5 Mon Sep 17 00:00:00 2001 From: michaelsouliman Date: Mon, 2 Jun 2025 15:20:44 -0700 Subject: [PATCH 14/17] added parallel processing to make it so that classification can keep happening while a report is being handled in the mod channel --- DiscordBot/bot.py | 40 ++++++++++++++++++++++++-------------- DiscordBot/user_stats.json | 8 ++++---- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index 79657eea..bdeaa411 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -12,6 +12,7 @@ import pdb import openai import time +import asyncio # Set up logging to the console logger = logging.getLogger('discord') @@ -179,22 +180,31 @@ async def handle_channel_message(self, message): if message.channel.name == f'group-{self.group_num}-mod': await self.handle_mod_channel_message(message) elif message.channel.name == f'group-{self.group_num}': - # Check for misinformation in all messages - has_misinfo = await self.detect_misinformation(message.content) + # Create a task for message classification that runs independently + asyncio.create_task(self.process_message(message)) + + async def process_message(self, message): + """ + Process a message for misinformation detection independently of the moderation flow. + This runs in parallel with other message processing and moderation tasks. 
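+
+        Because the task created in handle_channel_message is never awaited,
+        any exception raised here is only surfaced by the event loop's
+        default exception handler.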
+ """ + # Check for misinformation in the message + has_misinfo = await self.detect_misinformation(message.content) + + if has_misinfo: + # If misinformation is detected, classify the type for the report + abuse_type_raw = await self.classify_abuse_type(message.content) + abuse_type = self.normalize_abuse_type(abuse_type_raw) - if has_misinfo: - # If misinformation is detected, classify the type for the report - abuse_type_raw = await self.classify_abuse_type(message.content) - abuse_type = self.normalize_abuse_type(abuse_type_raw) - - if abuse_type: - # Start moderation flow for the detected misinformation - await self.start_moderation_flow( - report_type=abuse_type, - report_content=message.content, - message_author=message.author.name, - message_link=message.jump_url - ) + if abuse_type: + # Start moderation flow for the detected misinformation + # This will run independently of other message processing + await self.start_moderation_flow( + report_type=abuse_type, + report_content=message.content, + message_author=message.author.name, + message_link=message.jump_url + ) async def start_moderation_flow( self, diff --git a/DiscordBot/user_stats.json b/DiscordBot/user_stats.json index a8bf68cc..9dddb7de 100644 --- a/DiscordBot/user_stats.json +++ b/DiscordBot/user_stats.json @@ -3,10 +3,10 @@ "total_reports": 1, "reports": [ { - "timestamp": "2025-06-02T15:13:04.847343", - "report_type": "NEWS MISINFO - POLITICAL", - "report_content": "The recent election was rigged using a secret satellite controlled by a foreign government.", - "outcome": "Post removed and incident recorded", + "timestamp": "2025-06-02T15:19:21.241492", + "report_type": "NEWS MISINFO - SCIENTIFIC", + "report_content": "covid was created in a chinese lab", + "outcome": "Post removed and user temporarily muted", "explanation": "bad" } ] From 60413d2a07e09c4ed9bc15ef048d67201d7a324f Mon Sep 17 00:00:00 2001 From: michaelsouliman Date: Mon, 2 Jun 2025 15:32:39 -0700 Subject: [PATCH 15/17] add queue and logging for what reports are currently waiting to happen --- DiscordBot/bot.py | 72 ++++++++++++++++++++++++++++++++------ DiscordBot/user_stats.json | 29 ++++++++++++--- 2 files changed, 86 insertions(+), 15 deletions(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index bdeaa411..219851cc 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -50,8 +50,8 @@ def __init__(self): self.awaiting_appeal_confirmation = {} self.awaiting_appeal_reason = {} self.openai_client = openai.OpenAI(api_key=openai_api_key) - # self.model = load_model() - + # Initialize the report queue + self.report_queue = asyncio.Queue() async def on_ready(self): print(f'{self.user.name} has connected to Discord! It is these guilds:') @@ -71,7 +71,10 @@ async def on_ready(self): for channel in guild.text_channels: if channel.name == f'group-{self.group_num}-mod': self.mod_channels[guild.id] = channel - + + # Start the report queue processor + asyncio.create_task(self.process_report_queue()) + print("Report queue processor started.") async def on_message(self, message): ''' @@ -183,6 +186,48 @@ async def handle_channel_message(self, message): # Create a task for message classification that runs independently asyncio.create_task(self.process_message(message)) + async def process_report_queue(self): + """ + Continuously process reports from the queue. + This ensures reports are handled one at a time. 
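+        The loop polls self.active_mod_flow once per second, so the next
+        queued report only starts after the previous moderation flow ends.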
+ """ + while True: + try: + # Get the next report from the queue + report_data = await self.report_queue.get() + + print("\n=== Processing Report from Queue ===") + print(f"Queue size before processing: {self.report_queue.qsize()}") + print(f"Processing report for: {report_data['message_author']}") + print(f"Report type: {report_data['report_type']}") + print("================================\n") + + # Wait for any active moderation flow to complete + while self.active_mod_flow is not None: + print("Waiting for active moderation flow to complete...") + await asyncio.sleep(1) # Check every second + + # Process the report + await self.start_moderation_flow( + report_type=report_data['report_type'], + report_content=report_data['report_content'], + message_author=report_data['message_author'], + message_link=report_data.get('message_link'), + user_context=report_data.get('user_context') + ) + + # Mark the task as done + self.report_queue.task_done() + + print("\n=== Report Processing Complete ===") + print(f"Queue size after processing: {self.report_queue.qsize()}") + print("================================\n") + + except Exception as e: + print(f"Error processing report from queue: {e}") + # Continue processing the queue even if one report fails + continue + async def process_message(self, message): """ Process a message for misinformation detection independently of the moderation flow. @@ -197,14 +242,19 @@ async def process_message(self, message): abuse_type = self.normalize_abuse_type(abuse_type_raw) if abuse_type: - # Start moderation flow for the detected misinformation - # This will run independently of other message processing - await self.start_moderation_flow( - report_type=abuse_type, - report_content=message.content, - message_author=message.author.name, - message_link=message.jump_url - ) + # Add the report to the queue instead of processing it directly + report_data = { + 'report_type': abuse_type, + 'report_content': message.content, + 'message_author': message.author.name, + 'message_link': message.jump_url + } + await self.report_queue.put(report_data) + print("\n=== Report Added to Queue ===") + print(f"Author: {message.author.name}") + print(f"Type: {abuse_type}") + print(f"Current queue size: {self.report_queue.qsize()}") + print("============================\n") async def start_moderation_flow( self, diff --git a/DiscordBot/user_stats.json b/DiscordBot/user_stats.json index 9dddb7de..d83a703e 100644 --- a/DiscordBot/user_stats.json +++ b/DiscordBot/user_stats.json @@ -1,13 +1,34 @@ { "484531188581793803": { - "total_reports": 1, + "total_reports": 4, "reports": [ { - "timestamp": "2025-06-02T15:19:21.241492", - "report_type": "NEWS MISINFO - SCIENTIFIC", + "timestamp": "2025-06-02T15:30:40.219246", + "report_type": "NEWS MISINFO - POLITICAL", "report_content": "covid was created in a chinese lab", - "outcome": "Post removed and user temporarily muted", + "outcome": "Post removed and incident recorded", "explanation": "bad" + }, + { + "timestamp": "2025-06-02T15:31:34.897102", + "report_type": "NEWS MISINFO - POLITICAL", + "report_content": "The recent election was rigged using a secret satellite controlled by a foreign government.", + "outcome": "Post removed and incident recorded", + "explanation": "f" + }, + { + "timestamp": "2025-06-02T15:31:50.237193", + "report_type": "NEWS MISINFO - SCIENTIFIC", + "report_content": "covid was created in a chinese lab", + "outcome": "Post removed and incident recorded", + "explanation": "f" + }, + { + "timestamp": 
"2025-06-02T15:32:04.112294", + "report_type": "NEWS MISINFO - POLITICAL", + "report_content": "obama is not a us citizen", + "outcome": "Post flagged as unproven/non-scientific", + "explanation": "he is" } ] } From 93cf37fa156e7d6092c47c57d0c003609da9a1ea Mon Sep 17 00:00:00 2001 From: michaelsouliman Date: Wed, 4 Jun 2025 12:49:45 -0700 Subject: [PATCH 16/17] changed to use gpt-4o for autoflagging --- DiscordBot/bot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index 219851cc..ffc3ef61 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -1165,7 +1165,7 @@ async def detect_misinformation(self, message_content): try: response = self.openai_client.chat.completions.create( - model="gpt-4", + model="gpt-4o", messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": message_content} From b52f9efd0db5a7d726f539162f02bd6eb76386bc Mon Sep 17 00:00:00 2001 From: anushehchaudry Date: Sun, 8 Jun 2025 17:27:09 -0700 Subject: [PATCH 17/17] Add policy language into prompt --- DiscordBot/bot.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/DiscordBot/bot.py b/DiscordBot/bot.py index ffc3ef61..c887eb1a 100644 --- a/DiscordBot/bot.py +++ b/DiscordBot/bot.py @@ -984,6 +984,11 @@ async def classify_abuse_type(self, message_content, user_context=None): "- For misinformation types: `HEALTH (EMERGENCY) MISINFORMATION`, `NEWS (POLITICAL) MISINFORMATION`, `ADVERTISEMENT MISINFORMATION`, etc.\n" "- If the message does not fit any of these categories, respond with: `UNKNOWN`\n\n" "Only return the final category label." + "Here is additional information about what is classified as misinformation to aid in your decision, however note that this is not complete and may miss some cases:\n" + "News and Current Events: False representations of ongoing news events. Misrepresenting dangerous natural events.\n" + "Political Content: False information about voting processes. Content supporting voter suppression. Illegitimate questioning of candidate eligibility. Inaccurate political quotes or statements.Encouraging interference with democratic processes.\n" + "Health Information: Dangerous, unproven medical treatments. Content encouraging dangerous health activities. Alternative medicine that has potentially dangerous effects. False medical research." + ) if user_context: user_prompt = f"User context: {user_context}\nMessage: {message_content}\n\nClassify the abuse type:" @@ -1161,6 +1166,10 @@ async def detect_misinformation(self, message_content): "1 - if the message contains misinformation\n" "0 - if the message does not contain misinformation\n\n" "Do not provide any explanation, just the number." + "Here is additional information about what is classified as misinformation to aid in your decision, however note that this is not complete and may miss some cases\n" + "News and Current Events: False representations of ongoing news events. Misrepresenting dangerous natural events.\n" + "Political Content: False information about voting processes. Content supporting voter suppression. Illegitimate questioning of candidate eligibility. Inaccurate political quotes or statements.Encouraging interference with democratic processes.\n" + "Health Information: Dangerous, unproven medical treatments. Content encouraging dangerous health activities. Alternative medicine that has potentially dangerous effects. False medical research." ) try: