mirror of
https://github.com/zed-industries/zed.git
synced 2026-05-24 13:39:08 +00:00
490 lines
17 KiB
Python
490 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Track the effectiveness of the duplicate-detection bot by classifying issues
|
|
into outcome categories on a GitHub Projects v2 board.
|
|
|
|
Subcommands:
|
|
classify-closed <issue_number> <closer_login> <state_reason>
|
|
Classify a closed issue and add it to the project board.
|
|
|
|
classify-open
|
|
Classify open, triaged, bot-commented issues and add them to
|
|
the project board as Noise.
|
|
|
|
Requires:
|
|
requests (pip install requests)
|
|
|
|
Environment variables:
|
|
GITHUB_TOKEN - GitHub App token
|
|
PROJECT_NUMBER - GitHub Projects v2 board number (default: 76, override for local testing)
|
|
"""
|
|
|
|
import argparse
|
|
import functools
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
import requests
|
|
|
|
# --- GitHub endpoints -------------------------------------------------------
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = f"{GITHUB_API}/graphql"

# --- Repository and team being tracked --------------------------------------
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
STAFF_TEAM_SLUG = "staff"

# --- Duplicate-detection bot identity ---------------------------------------
BOT_APP_SLUG = "zed-community-bot"
# The bot's comment-author login is the app slug with a "[bot]" suffix.
BOT_LOGIN = f"{BOT_APP_SLUG}[bot]"
# Opening words that identify one of the bot's duplicate-detection comments.
BOT_COMMENT_PREFIX = "This issue appears to be a duplicate of"
# Lower bound for the "created:" filter used when searching open issues.
BOT_START_DATE = "2026-02-18"
# Bump this when the duplicate-detection bot's behavior changes in a way that
# could affect outcome rates (e.g. prompt rewrites, model swaps, candidate
# filtering changes). Don't bump for unrelated changes like comment formatting.
BOT_VERSION = "v2"

# --- Triage label and project board -----------------------------------------
NEEDS_TRIAGE_LABEL = "state:needs triage"
# Board number used when the PROJECT_NUMBER environment variable is not set.
DEFAULT_PROJECT_NUMBER = 76
# Only these close reasons are written to the board's "Closed as" field.
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
|
|
|
|
|
|
def github_api_get(path, params=None, timeout=30):
    """Send an authenticated GET to the GitHub REST API and return the decoded JSON.

    The request uses the module-level GITHUB_HEADERS, which the script entry
    point assigns before any subcommand runs.

    Args:
        path: API path relative to GITHUB_API; a leading slash is optional.
        params: Optional dict of query-string parameters.
        timeout: Seconds to wait on the server before giving up. requests has
            no default timeout, so without this a stalled connection would
            hang the run indefinitely.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    url = f"{GITHUB_API}/{path.lstrip('/')}"
    response = requests.get(
        url,
        headers=GITHUB_HEADERS,
        params=params,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
def github_search_issues(query):
    """Run an issue search and return the first page of hits, newest first.

    Returns the "items" list of the search response, or an empty list when
    the response has no such key.
    """
    # Single page on purpose: the oldest issues are on the board already,
    # so only the most recently created ones matter.
    search_params = {
        "q": query,
        "sort": "created",
        "order": "desc",
        "per_page": 100,
    }
    payload = github_api_get("/search/issues", search_params)
    return payload.get("items", [])
|
|
|
|
|
|
def is_staff_member(username):
    """Report whether `username` holds an active membership in the staff team.

    A 404 from the membership endpoint means "not a member" and yields False;
    any other HTTP error is re-raised to the caller.
    """
    membership_path = (
        f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
    )
    try:
        membership = github_api_get(membership_path)
    except requests.HTTPError as error:
        if error.response.status_code != 404:
            raise
        return False
    return membership.get("state") == "active"
|
|
|
|
|
|
def fetch_issue(issue_number):
    """Fetch one issue and reduce it to the fields the classifiers use.

    Returns a dict with "number", "node_id", "author" (empty string when the
    issue has no user) and "type_name" (None when the issue has no type).
    """
    payload = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    node_id = payload["node_id"]
    # "user" and "type" may be null in the payload, hence the `or {}` fallbacks.
    author = (payload.get("user") or {}).get("login", "")
    type_name = (payload.get("type") or {}).get("name")
    return {
        "number": issue_number,
        "node_id": node_id,
        "author": author,
        "type_name": type_name,
    }
|
|
|
|
|
|
def get_bot_duplicate_comment(issue_number):
    """Get the bot's duplicate-detection comment body from an issue.

    Pages through the issue's comments, 100 per request, until a page comes
    back empty. The first comment authored by BOT_LOGIN whose body starts
    with BOT_COMMENT_PREFIX wins.

    Args:
        issue_number: Number of the issue whose comments are scanned.

    Returns:
        The comment body if found, else None.

    Raises:
        requests.HTTPError: Propagated from github_api_get on a 4xx/5xx reply.
    """
    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    page = 1
    while comments := github_api_get(comments_path, {"per_page": 100, "page": page}):
        for comment in comments:
            author = (comment.get("user") or {}).get("login", "")
            # A default passed to .get() only covers a missing key; `or ""`
            # also covers an explicit null body, which would otherwise make
            # .startswith() raise AttributeError on None.
            body = comment.get("body") or ""
            if author == BOT_LOGIN and body.startswith(BOT_COMMENT_PREFIX):
                return body
        page += 1
    return None
|
|
|
|
|
|
def parse_suggested_issues(comment_body):
    """Return the issue numbers from the bot's bullet lines (e.g. '- #12345').

    Only lines that begin with "- #" followed by digits count; the numbers
    are returned as ints in the order they appear.
    """
    bullet_pattern = re.compile(r"^- #(\d+)", re.MULTILINE)
    return [int(hit.group(1)) for hit in bullet_pattern.finditer(comment_body)]
|
|
|
|
|
|
def github_api_graphql(query, variables=None, timeout=30):
    """Execute a GitHub GraphQL query or mutation and return its "data" payload.

    The request uses the module-level GITHUB_HEADERS, which the script entry
    point assigns before any subcommand runs.

    Args:
        query: GraphQL document text.
        variables: Optional dict of variables; an empty dict is sent when omitted.
        timeout: Seconds to wait on the server before giving up. requests has
            no default timeout, so without this a stalled connection would
            hang the run indefinitely.

    Returns:
        The value of the response's "data" key.

    Raises:
        requests.HTTPError: If the HTTP status is 4xx or 5xx.
        requests.Timeout: If the server does not respond within `timeout`.
        RuntimeError: If the response body contains an "errors" key.
    """
    response = requests.post(
        GRAPHQL_URL,
        headers=GITHUB_HEADERS,
        json={"query": query, "variables": variables or {}},
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()
    # The HTTP status alone does not reveal GraphQL failures; they are
    # reported under an "errors" key in the response body.
    if "errors" in data:
        raise RuntimeError(f"GraphQL errors: {data['errors']}")
    return data["data"]
|
|
|
|
|
|
def get_closed_as_duplicate_of(issue_number):
    """Look up which issue this one was marked as a duplicate of.

    Requests the last 10 MarkedAsDuplicateEvent timeline items and scans them
    from the end of the list backwards, returning the canonical issue number
    of the first event that carries one. Returns None when no event does.

    Note: not every "closed as duplicate" issue has a MarkedAsDuplicateEvent.
    If the closer used the "Close as duplicate" button without separately
    marking the duplicate relationship, no event exists and the result is
    None. The caller handles that by flagging the item for manual review.
    """
    timeline_query = """
        query($owner: String!, $repo: String!, $number: Int!) {
          repository(owner: $owner, name: $repo) {
            issue(number: $number) {
              timelineItems(last: 10, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {
                nodes {
                  ... on MarkedAsDuplicateEvent {
                    canonical { ... on Issue { number } }
                  }
                }
              }
            }
          }
        }
        """
    variables = {"owner": REPO_OWNER, "repo": REPO_NAME, "number": issue_number}
    result = github_api_graphql(timeline_query, variables)
    events = result["repository"]["issue"]["timelineItems"]["nodes"]

    # "canonical" may be null or lack a number (the query only selects it for
    # Issue targets), so such events are skipped.
    canonical_numbers = (
        (event.get("canonical") or {}).get("number") for event in reversed(events)
    )
    return next((number for number in canonical_numbers if number), None)
|
|
|
|
|
|
@functools.lru_cache
def get_project_config():
    """Load the project board's ID plus the IDs of its fields and options.

    Returns a dict of the form
    {"project_id": ..., "fields": {name: {"id": ..., "options": {name: id}}}},
    where "options" is present only for fields that report options. The
    result is cached, so the board is queried once per process.
    """
    fields_query = """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """
    response = github_api_graphql(
        fields_query, {"org": REPO_OWNER, "number": PROJECT_NUMBER}
    )
    project = response["organization"]["projectV2"]
    project_id = project["id"]

    fields = {}
    for node in project["fields"]["nodes"]:
        field_name = node.get("name")
        if field_name:
            # Nodes matching neither fragment come back without a name and
            # are left out.
            entry = {"id": node["id"]}
            if "options" in node:
                entry["options"] = {
                    choice["name"]: choice["id"] for choice in node["options"]
                }
            fields[field_name] = entry

    print(f" Project config loaded: {len(fields)} fields")
    return {"project_id": project_id, "fields": fields}
|
|
|
|
|
|
def find_project_item(issue_node_id):
    """Look for the issue among the items of our project board.

    Queries up to 20 of the issue's project items and returns the ID of the
    first one whose project number equals PROJECT_NUMBER, or None if there
    is no such item.
    """
    lookup_query = "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }"
    response = github_api_graphql(lookup_query, {"id": issue_node_id})
    board_items = response["node"]["projectItems"]["nodes"]
    return next(
        (
            entry["id"]
            for entry in board_items
            if entry["project"]["number"] == PROJECT_NUMBER
        ),
        None,
    )
|
|
|
|
|
|
def add_project_item(issue_node_id):
    """Put an issue on the project board and return the ID of the new item."""
    add_mutation = """
        mutation($projectId: ID!, $contentId: ID!) {
          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
            item { id }
          }
        }
        """
    project_id = get_project_config()["project_id"]
    result = github_api_graphql(
        add_mutation,
        {"projectId": project_id, "contentId": issue_node_id},
    )
    return result["addProjectV2ItemById"]["item"]["id"]
|
|
|
|
|
|
def set_field_value(item_id, field_name, value):
    """Write one field of a project board item.

    Fields that have options are treated as single-select and `value` is
    looked up by option name; all other fields receive `str(value)` as text.
    An unknown field or option only prints a warning and leaves the item
    untouched.
    """
    config = get_project_config()
    field = config["fields"].get(field_name)
    if not field:
        print(f" Warning: field '{field_name}' not found on project board")
        return

    if "options" not in field:
        # text field
        payload = {"text": str(value)}
    else:
        # single-select field: translate the option name into its ID
        option_id = field["options"].get(value)
        if not option_id:
            print(f" Warning: option '{value}' not found for field '{field_name}'")
            return
        payload = {"singleSelectOptionId": option_id}

    update_mutation = """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
          updateProjectV2ItemFieldValue(input: {
            projectId: $projectId
            itemId: $itemId
            fieldId: $fieldId
            value: $value
          }) {
            projectV2Item { id }
          }
        }
        """
    variables = {
        "projectId": config["project_id"],
        "itemId": item_id,
        "fieldId": field["id"],
        "value": payload,
    }
    github_api_graphql(update_mutation, variables)
|
|
|
|
|
|
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None):
    """Ensure the issue is on the project board and write its field values.

    Reuses the existing board item when there is one, otherwise adds a new
    item. "Outcome", "Status" and "Bot version" are always written;
    "Closed as" only for values in VALID_CLOSED_AS_VALUES; "Notes" only when
    non-empty. Returns the board item ID.
    """
    item_id = find_project_item(issue_node_id)
    if not item_id:
        item_id = add_project_item(issue_node_id)
        print(f" Added to project board (item {item_id})")
    else:
        print(f" Issue already on board, updating (item {item_id})")

    # Collect the writes first, keeping the order in which they are applied.
    updates = [("Outcome", outcome), ("Status", status)]
    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        updates.append(("Closed as", closed_as))
    if notes:
        updates.append(("Notes", notes))
    updates.append(("Bot version", BOT_VERSION))

    for field_name, field_value in updates:
        set_field_value(item_id, field_name, field_value)

    return item_id
|
|
|
|
|
|
def classify_closed(issue_number, closer_login, state_reason):
    """Work out the outcome for a closed issue and record it on the board.

    Issues authored by staff are skipped. Otherwise the outcome depends on
    whether the bot left a duplicate comment, whether the closer is the
    author, and whether the issue was closed as a duplicate.
    """
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f" Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f" Author: {author}, type: {issue['type_name']}")

    if is_staff_member(author):
        print(f" Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_duplicate_comment(issue_number)
    bot_commented = bot_comment is not None
    print(f" Bot commented: {bot_commented}")

    if bot_commented:
        if closer_login == author:
            classify_as_success(issue, state_reason)
        else:
            # Only authors, staff, and triagers can close issues, so
            # a non-author closer is always someone with elevated permissions.
            classify_non_author_closed(issue, bot_comment, state_reason)
        return

    if state_reason == "duplicate":
        classify_as_missed_opportunity(issue)
        return

    print(" Skipping: no bot comment and not closed as duplicate")
|
|
|
|
|
|
def classify_as_success(issue, state_reason):
    """Record a Success: the author closed their own issue after the bot commented.

    Closing as "duplicate" is auto-classified; any other reason is still
    recorded as Success but flagged "Needs review" with an explanatory note.
    """
    if state_reason == "duplicate":
        status, notes = "Auto-classified", None
        print(f" -> Success (closed as {state_reason})")
    else:
        # could be closed for an unrelated reason; flag for review
        status = "Needs review"
        notes = f"Author closed as {state_reason}"
        print(f" -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
    )
|
|
|
|
|
|
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Handle an issue the bot commented on that someone other than the author closed.

    A "duplicate" close is handed to classify_as_assist; anything else is
    recorded as Noise and flagged "Needs review".
    """
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f" -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
    )
|
|
|
|
|
|
def classify_as_assist(issue, bot_comment):
    """Record an Assist: closed as duplicate by a non-author after the bot commented.

    Auto-classified only when the issue it was marked a duplicate of is among
    the bot's suggestions. A mismatch, an undetermined original, or an
    unparseable bot comment is flagged "Needs review" with a note.
    """
    suggested = parse_suggested_issues(bot_comment)
    try:
        original = get_closed_as_duplicate_of(issue["number"])
    except (requests.RequestException, RuntimeError) as error:
        # Best effort: a failed lookup falls through to manual review below.
        original = None
        print(f" Warning: failed to get the original-for the duplicate issue: {error}")

    status = "Needs review"
    if not original:
        notes = "Could not determine original issue from timeline"
        print(f" -> Possible Assist, needs review ({notes})")
    elif not suggested:
        notes = f"Closed as dup of #{original}; could not parse bot suggestions"
        print(f" -> Possible Assist, needs review ({notes})")
    elif original in suggested:
        status = "Auto-classified"
        notes = None
        print(f" -> Assist (original #{original} matches bot suggestion)")
    else:
        suggested_str = ", ".join(f"#{number}" for number in suggested)
        notes = f"Bot suggested {suggested_str}; closed as dup of #{original}"
        print(f" -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
    )
|
|
|
|
|
|
def classify_as_missed_opportunity(issue):
    """Record a Missed opportunity: closed as duplicate with no bot comment."""
    print(" -> Missed opportunity")
    board_fields = {
        "outcome": "Missed opportunity",
        "closed_as": "duplicate",
        "status": "Auto-classified",
    }
    add_or_update_project_item(issue["node_id"], **board_fields)
|
|
|
|
|
|
def classify_open():
    """Sweep open, triaged, bot-commented issues and record them as Noise.

    Candidates come from one search for open issues the bot commented on,
    without the needs-triage label, created on or after BOT_START_DATE. A
    candidate is skipped when its type is not Bug or Crash, its author is
    staff, it is already on the board, or no bot duplicate comment is found.
    """
    print("Classifying open issues")

    query = " ".join(
        [
            f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open",
            f"commenter:app/{BOT_APP_SLUG}",
            f'-label:"{NEEDS_TRIAGE_LABEL}"',
            f"created:>={BOT_START_DATE}",
        ]
    )
    print(f" Search query: {query}")

    results = github_search_issues(query)
    print(f" Found {len(results)} candidate issues")

    added = 0
    skipped = 0
    errors = 0
    for item in results:
        number = item["number"]
        try:
            type_name = (item.get("type") or {}).get("name")
            author = (item.get("user") or {}).get("login", "")
            node_id = item["node_id"]

            # Checks run in this order and stop at the first hit, so the
            # later ones (which cost API calls) are skipped when possible.
            if type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            elif not get_bot_duplicate_comment(number):
                skip_reason = "no bot duplicate comment found"
            else:
                skip_reason = None

            if skip_reason:
                print(f" #{number}: skipping, {skip_reason}")
                skipped += 1
                continue

            print(f" #{number}: adding as Noise")
            add_or_update_project_item(node_id, outcome="Noise", status="Auto-classified")
            added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f" #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f" Done: added {added}, skipped {skipped}, errors {errors}")
|
|
|
|
|
|
if __name__ == "__main__":
    # --- Command line -------------------------------------------------------
    parser = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    closed_parser = subparsers.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    closed_parser.add_argument("issue_number", type=int)
    for positional in ("closer_login", "state_reason"):
        closed_parser.add_argument(positional)

    subparsers.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = parser.parse_args()

    # --- Environment --------------------------------------------------------
    # GITHUB_TOKEN, PROJECT_NUMBER and GITHUB_HEADERS become module globals
    # here; the API helpers above read them, so they must be assigned before
    # a subcommand is dispatched.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
    try:
        # An unset or empty variable falls back to the default board.
        PROJECT_NUMBER = (
            int(raw_project_number) if raw_project_number else DEFAULT_PROJECT_NUMBER
        )
    except ValueError:
        print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
        sys.exit(1)

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    # --- Dispatch -----------------------------------------------------------
    if args.command == "classify-open":
        classify_open()
    elif args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)
|