zed/script/github-track-duplicate-bot-effectiveness.py
Lena 6741a1df28
Add duplicate bot effectiveness tracking (w/github project) (#49879)
Add a script that classifies open and closed issues that the duplicate
bot commented (or not commented) on and puts them into the appropriate
columns of the dedicated github project board. Add a workflow that calls
that script for every closed issue and also on a schedule (that's for
the open ones).

If you're reading this some time way later and there's no bot running
around the repository leaving comments like “This issue appears to be a
duplicate of...”, you can delete these files.

Release Notes:

- N/A
2026-02-23 13:22:29 +00:00

484 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Track the effectiveness of the duplicate-detection bot by classifying issues
into outcome categories on a GitHub Projects v2 board.
Subcommands:
classify-closed <issue_number> <closer_login> <state_reason>
Classify a closed issue and add it to the project board.
classify-open
Classify open, triaged, bot-commented issues and add them to
the project board as Noise.
Requires:
requests (pip install requests)
Environment variables:
GITHUB_TOKEN - GitHub App token
PROJECT_NUMBER - GitHub Projects v2 board number (default: 76, override for local testing)
"""
import argparse
import functools
import os
import re
import sys
import requests
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = "https://api.github.com/graphql"
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
STAFF_TEAM_SLUG = "staff"
BOT_LOGIN = "zed-community-bot[bot]"
BOT_APP_SLUG = "zed-community-bot"
BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
BOT_START_DATE = "2026-02-18"
NEEDS_TRIAGE_LABEL = "state:needs triage"
DEFAULT_PROJECT_NUMBER = 76
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
def github_api_get(path, params=None):
url = f"{GITHUB_API}/{path.lstrip('/')}"
response = requests.get(url, headers=GITHUB_HEADERS, params=params)
response.raise_for_status()
return response.json()
def github_search_issues(query):
"""Search issues, returning most recently created first."""
# not handling pagination on purpose: the oldest issues are on the board already
params = {"q": query, "sort": "created", "order": "desc", "per_page": 100}
return github_api_get("/search/issues", params).get("items", [])
def is_staff_member(username):
"""Check if user is an active member of the staff team."""
try:
data = github_api_get(
f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
)
return data.get("state") == "active"
except requests.HTTPError as error:
if error.response.status_code == 404:
return False
raise
def fetch_issue(issue_number):
data = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
return {
"number": issue_number,
"node_id": data["node_id"],
"author": (data.get("user") or {}).get("login", ""),
"type_name": (data.get("type") or {}).get("name"),
}
def get_bot_duplicate_comment(issue_number):
"""Get the bot's duplicate-detection comment body from an issue.
Returns the comment body if found, else None.
"""
comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
page = 1
while comments := github_api_get(comments_path, {"per_page": 100, "page": page}):
for comment in comments:
author = (comment.get("user") or {}).get("login", "")
body = comment.get("body", "")
if author == BOT_LOGIN and body.startswith(BOT_COMMENT_PREFIX):
return body
page += 1
return None
def parse_suggested_issues(comment_body):
"""Extract issue numbers from the bot's comment (lines like '- #12345')."""
return [int(match) for match in re.findall(r"^- #(\d+)", comment_body, re.MULTILINE)]
def github_api_graphql(query, variables=None):
"""Execute a GitHub GraphQL query. Raises on errors."""
response = requests.post(
GRAPHQL_URL,
headers=GITHUB_HEADERS,
json={"query": query, "variables": variables or {}},
)
response.raise_for_status()
data = response.json()
if "errors" in data:
raise RuntimeError(f"GraphQL errors: {data['errors']}")
return data["data"]
def get_closed_as_duplicate_of(issue_number):
"""Get the issue number this issue was closed as a duplicate of.
Uses the timeline to find the most recent MarkedAsDuplicateEvent.
Returns the original issue number, or None.
Note: not all "closed as duplicate" issues have a MarkedAsDuplicateEvent.
If the closer used the "Close as duplicate" button without separately
marking the duplicate relationship, no event is created and this returns
None. The caller handles this by flagging the item for manual review.
"""
data = github_api_graphql(
"""
query($owner: String!, $repo: String!, $number: Int!) {
repository(owner: $owner, name: $repo) {
issue(number: $number) {
timelineItems(last: 10, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
nodes {
... on MarkedAsDuplicateEvent {
canonical { ... on Issue { number } }
}
}
}
}
}
}
""",
{"owner": REPO_OWNER, "repo": REPO_NAME, "number": issue_number},
)
nodes = data["repository"]["issue"]["timelineItems"]["nodes"]
for node in reversed(nodes):
if original := (node.get("canonical") or {}).get("number"):
return original
return None
@functools.lru_cache
def get_project_config():
"""Fetch the project board's ID, field IDs, and option IDs."""
data = github_api_graphql(
"""
query($org: String!, $number: Int!) {
organization(login: $org) {
projectV2(number: $number) {
id
fields(first: 30) {
nodes {
... on ProjectV2SingleSelectField { id name options { id name } }
... on ProjectV2Field { id name }
}
}
}
}
}
""",
{"org": REPO_OWNER, "number": PROJECT_NUMBER},
)
project = data["organization"]["projectV2"]
config = {"project_id": project["id"], "fields": {}}
for field_node in project["fields"]["nodes"]:
name = field_node.get("name")
if not name:
continue
field_info = {"id": field_node["id"]}
if "options" in field_node:
field_info["options"] = {
option["name"]: option["id"] for option in field_node["options"]
}
config["fields"][name] = field_info
print(f" Project config loaded: {len(config['fields'])} fields")
return config
def find_project_item(issue_node_id):
"""Check if an issue is already on our project board.
Returns the project item ID if found, or None.
"""
data = github_api_graphql(
"query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }",
{"id": issue_node_id},
)
for item in data["node"]["projectItems"]["nodes"]:
if item["project"]["number"] == PROJECT_NUMBER:
return item["id"]
return None
def add_project_item(issue_node_id):
"""Add an issue to the project board. Returns the new item ID."""
config = get_project_config()
data = github_api_graphql(
"""
mutation($projectId: ID!, $contentId: ID!) {
addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
item { id }
}
}
""",
{"projectId": config["project_id"], "contentId": issue_node_id},
)
return data["addProjectV2ItemById"]["item"]["id"]
def set_field_value(item_id, field_name, value):
"""Set a single field value on a project board item."""
config = get_project_config()
field = config["fields"].get(field_name)
if not field:
print(f" Warning: field '{field_name}' not found on project board")
return
if "options" in field:
# single-select field
option_id = field["options"].get(value)
if not option_id:
print(f" Warning: option '{value}' not found for field '{field_name}'")
return
field_value = {"singleSelectOptionId": option_id}
else:
# text field
field_value = {"text": str(value)}
github_api_graphql(
"""
mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
updateProjectV2ItemFieldValue(input: {
projectId: $projectId
itemId: $itemId
fieldId: $fieldId
value: $value
}) {
projectV2Item { id }
}
}
""",
{
"projectId": config["project_id"],
"itemId": item_id,
"fieldId": field["id"],
"value": field_value,
},
)
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None):
"""Add an issue to the project board (or update it if already there), setting field values."""
item_id = find_project_item(issue_node_id)
if item_id:
print(f" Issue already on board, updating (item {item_id})")
else:
item_id = add_project_item(issue_node_id)
print(f" Added to project board (item {item_id})")
set_field_value(item_id, "Outcome", outcome)
set_field_value(item_id, "Status", status)
if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
set_field_value(item_id, "Closed as", closed_as)
if notes:
set_field_value(item_id, "Notes", notes)
return item_id
def classify_closed(issue_number, closer_login, state_reason):
"""Classify a closed issue and add/update it on the project board."""
state_reason = state_reason or "unknown"
print(f"Classifying closed issue #{issue_number}")
print(f" Closer: {closer_login}, state_reason: {state_reason}")
issue = fetch_issue(issue_number)
author = issue["author"]
print(f" Author: {author}, type: {issue['type_name']}")
if is_staff_member(author):
print(f" Skipping: author '{author}' is a staff member")
return
bot_comment = get_bot_duplicate_comment(issue_number)
bot_commented = bot_comment is not None
print(f" Bot commented: {bot_commented}")
closer_is_author = closer_login == author
if bot_commented and closer_is_author:
classify_as_success(issue, state_reason)
elif bot_commented and not closer_is_author:
# Only authors, staff, and triagers can close issues, so
# a non-author closer is always someone with elevated permissions.
classify_non_author_closed(issue, bot_comment, state_reason)
elif not bot_commented and state_reason == "duplicate":
classify_as_missed_opportunity(issue)
else:
print(" Skipping: no bot comment and not closed as duplicate")
def classify_as_success(issue, state_reason):
"""Author closed their own issue after the bot commented."""
if state_reason == "duplicate":
status = "Auto-classified"
notes = None
else:
# could be closed for an unrelated reason; flag for review
status = "Needs review"
notes = f"Author closed as {state_reason}"
if status == "Auto-classified":
print(f" -> Success (closed as {state_reason})")
else:
print(f" -> Possible Success, needs review ({notes})")
add_or_update_project_item(
issue["node_id"],
outcome="Success",
closed_as=state_reason,
status=status,
notes=notes,
)
def classify_non_author_closed(issue, bot_comment, state_reason):
"""Non-author (staff or triager) closed an issue the bot had commented on."""
if state_reason == "duplicate":
classify_as_assist(issue, bot_comment)
else:
notes = f"Closed by staff/triager as {state_reason}, not duplicate"
print(f" -> Possible Noise, needs review ({notes})")
add_or_update_project_item(
issue["node_id"],
outcome="Noise",
closed_as=state_reason,
status="Needs review",
notes=notes,
)
def classify_as_assist(issue, bot_comment):
"""Staff member closed as duplicate after the bot commented. Check if the dup matches."""
suggested = parse_suggested_issues(bot_comment)
original = None
try:
original = get_closed_as_duplicate_of(issue["number"])
except (requests.RequestException, RuntimeError) as error:
print(f" Warning: failed to get the original-for the duplicate issue: {error}")
if original and suggested:
if original in suggested:
status = "Auto-classified"
notes = None
print(f" -> Assist (original #{original} matches bot suggestion)")
else:
status = "Needs review"
suggested_str = ", ".join(f"#{number}" for number in suggested)
notes = f"Bot suggested {suggested_str}; closed as dup of #{original}"
print(f" -> Possible Assist, needs review ({notes})")
else:
# couldn't determine original or no suggestions parsed
status = "Needs review"
if not original:
notes = "Could not determine original issue from timeline"
else:
notes = f"Closed as dup of #{original}; could not parse bot suggestions"
print(f" -> Possible Assist, needs review ({notes})")
add_or_update_project_item(
issue["node_id"], outcome="Assist", closed_as="duplicate", status=status, notes=notes)
def classify_as_missed_opportunity(issue):
"""Issue closed as duplicate but the bot never commented."""
print(" -> Missed opportunity")
add_or_update_project_item(
issue["node_id"], outcome="Missed opportunity", closed_as="duplicate", status="Auto-classified")
def classify_open():
"""Classify open, triaged, bot-commented issues as Noise."""
print("Classifying open issues")
query = (
f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open "
f"commenter:app/{BOT_APP_SLUG} "
f'-label:"{NEEDS_TRIAGE_LABEL}" '
f"created:>={BOT_START_DATE}"
)
print(f" Search query: {query}")
results = github_search_issues(query)
print(f" Found {len(results)} candidate issues")
added, skipped, errors = 0, 0, 0
for item in results:
number = item["number"]
try:
type_name = (item.get("type") or {}).get("name")
author = (item.get("user") or {}).get("login", "")
node_id = item["node_id"]
skip_reason = (
f"type is {type_name}" if type_name not in ("Bug", "Crash")
else f"author {author} is staff" if is_staff_member(author)
else "already on the board" if find_project_item(node_id)
else "no bot duplicate comment found" if not get_bot_duplicate_comment(number)
else None
)
if skip_reason:
print(f" #{number}: skipping, {skip_reason}")
skipped += 1
continue
print(f" #{number}: adding as Noise")
add_or_update_project_item(node_id, outcome="Noise", status="Auto-classified")
added += 1
except Exception as error: # broad catch: one issue failing shouldn't stop the sweep
print(f" #{number}: error processing issue, skipping: {error}")
errors += 1
print(f" Done: added {added}, skipped {skipped}, errors {errors}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Track duplicate bot effectiveness on a GitHub project board.",
)
subparsers = parser.add_subparsers(dest="command", required=True)
classify_parser = subparsers.add_parser(
"classify-closed",
help="Classify a closed issue and add it to the project board.",
)
classify_parser.add_argument("issue_number", type=int)
classify_parser.add_argument("closer_login")
classify_parser.add_argument("state_reason")
subparsers.add_parser(
"classify-open",
help="Classify open, triaged, bot-commented issues as Noise.",
)
args = parser.parse_args()
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
if not GITHUB_TOKEN:
print("Error: GITHUB_TOKEN environment variable is required")
sys.exit(1)
raw_project_number = os.environ.get("PROJECT_NUMBER", "")
if raw_project_number:
try:
PROJECT_NUMBER = int(raw_project_number)
except ValueError:
print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
sys.exit(1)
else:
PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER
GITHUB_HEADERS = {
"Authorization": f"token {GITHUB_TOKEN}",
"Accept": "application/vnd.github+json",
}
if args.command == "classify-closed":
classify_closed(args.issue_number, args.closer_login, args.state_reason)
elif args.command == "classify-open":
classify_open()