refactoring

This commit is contained in:
2026-05-02 16:24:42 +09:00
parent 296adf3073
commit 859c39fe0c
194 changed files with 5267 additions and 0 deletions

6
crawler/__init__.py Normal file
View File

@@ -0,0 +1,6 @@
"""
crawler/ — 네이버 스포츠 API 크롤링 패키지
네이버 API에서 데이터를 수집하고, relay 데이터를 파싱하여
정규화된 JSON 리포트를 생성합니다.
"""

116
crawler/lineup_builder.py Normal file
View File

@@ -0,0 +1,116 @@
"""
crawler/lineup_builder.py — 라인업 데이터 구성
relay 데이터와 preview 데이터에서 라인업 정보를 추출합니다.
"""
from __future__ import annotations
from typing import Any
from crawler.naver_api import get_team_names
def get_starting_pitcher(pitchers: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Pick the starting pitcher out of a relay pitcher list.

    The starter is the entry with the smallest ``seqno``; entries without
    that key are ranked as 999. On ties the earliest entry wins.

    Args:
        pitchers: Pitcher dicts taken from the relay lineup payload.

    Returns:
        The selected pitcher dict, or ``None`` when the list is empty.
    """
    starter: dict[str, Any] | None = None
    lowest_seqno: Any = None
    for candidate in pitchers or ():
        seqno = candidate.get("seqno", 999)
        # Strict "<" keeps the first entry when several share the minimum.
        if starter is None or seqno < lowest_seqno:
            starter, lowest_seqno = candidate, seqno
    return starter
def get_starting_batters(batters: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Extract the starting lineup from a relay batter list.

    For every batting-order slot the batter with the lowest ``seqno`` is
    kept; later entries in the same slot are ignored. Batters without a
    ``batOrder`` are dropped.

    Args:
        batters: Batter dicts taken from the relay lineup payload.

    Returns:
        One batter per batting-order slot, sorted by slot.
    """
    ordered = sorted(
        batters,
        key=lambda entry: (entry.get("batOrder", 999), entry.get("seqno", 999)),
    )
    first_in_slot: dict[int, dict[str, Any]] = {}
    for entry in ordered:
        slot = entry.get("batOrder")
        if slot is not None:
            # setdefault keeps whoever claimed the slot first.
            first_in_slot.setdefault(slot, entry)
    return [first_in_slot[slot] for slot in sorted(first_in_slot)]
def build_lineup_team(team_name: str, lineup: dict[str, Any]) -> dict[str, Any]:
    """Normalize one team's relay lineup into the report's lineup dict.

    Args:
        team_name: Display name stored under ``team_name``.
        lineup: Relay lineup dict; its ``pitcher`` and ``batter`` lists are
            read (both default to empty).

    Returns:
        A dict with ``team_name``, ``starter_pitcher`` (``None`` when no
        pitcher was found) and ``players`` (the starting batters).
    """
    pitcher = get_starting_pitcher(lineup.get("pitcher", []))
    batters = get_starting_batters(lineup.get("batter", []))

    pitcher_entry = None
    if pitcher:
        pitcher_entry = {
            "name": pitcher.get("name"),
            "position": "투수",
            "number": pitcher.get("backnum"),
        }

    player_entries = []
    for batter in batters:
        player_entries.append(
            {
                "bat_order": batter.get("batOrder"),
                "name": batter.get("name"),
                "position": batter.get("posName"),
                "number": batter.get("backnum"),
            }
        )

    return {
        "team_name": team_name,
        "starter_pitcher": pitcher_entry,
        "players": player_entries,
    }
def build_preview_lineup_team(
    team_name: str, preview_lineup: dict[str, Any] | None,
) -> dict[str, Any] | None:
    """Normalize one team's preview lineup into the report's lineup dict.

    The starting pitcher is the first player whose ``positionName`` is
    "선발투수". Only when no player carries that position does the lookup
    fall back to the first player with a zero or missing ``batorder``.
    Checking the explicit position first keeps a non-pitcher without a bat
    order, listed earlier in ``fullLineUp``, from being reported as starter.

    Args:
        team_name: Display name stored under ``team_name``.
        preview_lineup: The team's preview lineup dict, or ``None``.

    Returns:
        ``None`` when ``preview_lineup`` is empty or missing; otherwise a
        dict with ``team_name``, ``starter_pitcher`` (or ``None``) and
        ``players`` sorted by batting order.

    Raises:
        ValueError: If a ``batorder`` value cannot be converted to ``int``.
    """
    if not preview_lineup:
        return None
    full_lineup = preview_lineup.get("fullLineUp") or []

    def bat_order_of(player: dict[str, Any]) -> int:
        # Missing, None and "" all count as "no batting slot" (0).
        return int(player.get("batorder", 0) or 0)

    starter_pitcher = next(
        (player for player in full_lineup if player.get("positionName") == "선발투수"),
        None,
    )
    if starter_pitcher is None:
        starter_pitcher = next(
            (player for player in full_lineup if bat_order_of(player) == 0),
            None,
        )

    batters = sorted(
        (player for player in full_lineup if bat_order_of(player) > 0),
        key=bat_order_of,
    )

    return {
        "team_name": team_name,
        "starter_pitcher": {
            "name": starter_pitcher.get("playerName"),
            "position": "투수",
            "number": starter_pitcher.get("backnum"),
        }
        if starter_pitcher
        else None,
        "players": [
            {
                "bat_order": bat_order_of(player),
                "name": player.get("playerName"),
                "position": player.get("positionName"),
                "number": player.get("backnum"),
            }
            for player in batters
        ],
    }
def build_lineup_summary(
    game_id: str,
    game_info: dict[str, Any],
    relay_data: dict[str, Any],
    preview_data: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the lineup summary for both teams (preview first, relay fallback).

    Args:
        game_id: Game identifier, passed to ``get_team_names``.
        game_info: Game info dict, passed to ``get_team_names``.
        relay_data: Relay payload; ``awayLineup`` / ``homeLineup`` are read
            only for a team whose preview lineup is unavailable.
        preview_data: Optional preview payload with ``awayTeamLineUp`` and
            ``homeTeamLineUp``.

    Returns:
        A dict with ``away_team`` and ``home_team`` lineup dicts.
    """
    away_name, home_name = get_team_names(game_id, game_info)
    preview = preview_data or {}

    away_team = build_preview_lineup_team(away_name, preview.get("awayTeamLineUp"))
    home_team = build_preview_lineup_team(home_name, preview.get("homeTeamLineUp"))

    # The relay lineup is consulted only when the preview produced nothing.
    if not away_team:
        away_team = build_lineup_team(away_name, relay_data["awayLineup"])
    if not home_team:
        home_team = build_lineup_team(home_name, relay_data["homeLineup"])

    return {"away_team": away_team, "home_team": home_team}

197
crawler/naver_api.py Normal file
View File

@@ -0,0 +1,197 @@
"""
crawler/naver_api.py — 네이버 스포츠 API HTTP 클라이언트
모든 네이버 API 호출을 캡슐화합니다.
"""
from __future__ import annotations
import re
from datetime import datetime
from typing import Any
import httpx
from core.config_loader import (
crawler_headers,
game_type_map,
kbo_sr_id_candidates,
result_labels,
team_code_map,
)
BASE_URL = "https://api-gw.sports.naver.com/schedule/games"
KBO_URL = "https://www.koreabaseball.com/ws/Schedule.asmx/GetScoreBoardScroll"
class NaverApiClient:
    """HTTP client for the Naver Sports schedule API.

    Wraps an ``httpx.Client`` and exposes fetchers for game info, relay,
    record and preview payloads, plus a metadata lookup against ``KBO_URL``.
    The underlying client exists only inside a ``with`` block:

        with NaverApiClient() as api:
            relay = api.fetch_relay(game_id)
    """

    def __init__(self, timeout: float = 20.0):
        """Remember the request timeout; no HTTP client is created yet."""
        self._timeout = timeout
        self._client: httpx.Client | None = None

    def __enter__(self) -> "NaverApiClient":
        """Open the HTTP client with the configured crawler headers."""
        self._client = httpx.Client(headers=crawler_headers(), timeout=self._timeout)
        return self

    def __exit__(self, *args: Any) -> None:
        """Close the HTTP client, if one is open, and forget it."""
        if not self._client:
            return
        self._client.close()
        self._client = None

    @property
    def client(self) -> httpx.Client:
        """The open HTTP client.

        Raises:
            RuntimeError: If accessed outside a ``with`` block.
        """
        if self._client is not None:
            return self._client
        raise RuntimeError("NaverApiClient는 with 문 안에서 사용하세요.")

    def _get_json(self, url: str) -> dict[str, Any]:
        """GET ``url``, raise on an HTTP error status, return the JSON body."""
        response = self.client.get(url)
        response.raise_for_status()
        return response.json()

    # ──────────────────────────────────────────
    # Game information
    # ──────────────────────────────────────────
    def fetch_game_info(self, game_id: str) -> dict[str, Any]:
        """Return ``result.game`` of the game endpoint."""
        return self._get_json(f"{BASE_URL}/{game_id}")["result"]["game"]

    def fetch_relay(self, game_id: str, inning: int | None = None) -> dict[str, Any]:
        """Return ``result.textRelayData`` of the relay endpoint.

        Args:
            game_id: Game identifier.
            inning: When given, it is sent as the ``inning`` query parameter.
        """
        query = "" if inning is None else f"?inning={inning}"
        payload = self._get_json(f"{BASE_URL}/{game_id}/relay{query}")
        return payload["result"]["textRelayData"]

    def fetch_record(self, game_id: str) -> dict[str, Any]:
        """Return ``result.recordData`` of the record endpoint (``fields=all``)."""
        return self._get_json(f"{BASE_URL}/{game_id}/record?fields=all")["result"]["recordData"]

    def fetch_preview(self, game_id: str) -> dict[str, Any]:
        """Return ``result.previewData`` of the preview endpoint, or ``{}``."""
        result = self._get_json(f"{BASE_URL}/{game_id}/preview")["result"]
        return result.get("previewData") or {}

    # ──────────────────────────────────────────
    # KBO official-site data
    # ──────────────────────────────────────────
    def fetch_kbo_review_meta(
        self, game_id: str, game_info: dict[str, Any],
    ) -> dict[str, Any]:
        """Look up game metadata (END_TM, CROWD_CN, ...) via ``KBO_URL``.

        One POST is sent per candidate ``srId`` configured for the inferred
        game type (falling back to the "정규경기" candidates). The first
        payload whose ``code`` is "100" and which has a truthy ``END_TM``,
        ``START_TM``, ``USE_TM`` or ``CROWD_CN`` is returned.

        Returns:
            The matching payload, or ``{}`` when no candidate matches.
        """
        game_type = infer_game_type(game_info)
        sr_ids = kbo_sr_id_candidates().get(game_type, kbo_sr_id_candidates()["정규경기"])
        kbo_game_id = to_kbo_game_id(game_id)
        meta_keys = ("END_TM", "START_TM", "USE_TM", "CROWD_CN")

        for sr_id in sr_ids:
            response = self.client.post(
                KBO_URL,
                data={
                    "leId": "1",
                    "srId": sr_id,
                    "seasonId": str(game_info.get("seasonYear") or ""),
                    "gameId": kbo_game_id,
                },
            )
            response.raise_for_status()
            body = response.json()
            succeeded = str(body.get("code")) == "100"
            if succeeded and any(body.get(key) for key in meta_keys):
                return body
        return {}
# ──────────────────────────────────────────────
# 유틸리티 함수 (순수)
# ──────────────────────────────────────────────
def clean_game_id(game_id: str) -> str:
    """Keep only the ASCII letters and digits of ``game_id``.

    Args:
        game_id: Raw identifier, possibly containing separators.

    Returns:
        The identifier with every other character removed.
    """
    alnum_runs = re.findall(r"[A-Za-z0-9]+", game_id)
    return "".join(alnum_runs)
def get_team_names(
    game_id: str, game_info: dict[str, Any] | None = None,
) -> tuple[str, str]:
    """Return the ``(away, home)`` team names for a game.

    When ``game_info`` is given its ``awayTeamName`` / ``homeTeamName`` are
    used. Otherwise the two-character codes at ``game_id[8:10]`` and
    ``game_id[10:12]`` are translated through ``team_code_map()``; a code
    without a mapping is returned unchanged.

    Args:
        game_id: Game identifier.
        game_info: Optional game info dict.

    Returns:
        A ``(away_name, home_name)`` tuple.
    """
    if not game_info:
        names_by_code = team_code_map()
        away_code, home_code = game_id[8:10], game_id[10:12]
        return (
            names_by_code.get(away_code, away_code),
            names_by_code.get(home_code, home_code),
        )
    return game_info["awayTeamName"], game_info["homeTeamName"]
def infer_game_type(game_info: dict[str, Any]) -> str:
    """Infer the game-type label from a game info dict.

    A non-blank ``roundName`` wins. Otherwise the lower-cased ``roundCode``
    is searched for each key of ``game_type_map()`` and the label of the
    first key found is returned.

    Args:
        game_info: Game info dict.

    Returns:
        The game-type label; "정규경기" when nothing matches.
    """
    explicit_name = str(game_info.get("roundName") or "").strip()
    if explicit_name:
        return explicit_name

    code = str(game_info.get("roundCode") or "").lower()
    return next(
        (label for key, label in game_type_map().items() if key in code),
        "정규경기",
    )
def to_kbo_game_id(game_id: str) -> str:
    """Convert a Naver game id into the id sent to ``KBO_URL``.

    Args:
        game_id: Naver game identifier.

    Returns:
        The first 12 characters of ``game_id`` followed by "0".
    """
    prefix = game_id[:12]
    return prefix + "0"
def build_iso_datetime(game_date: str | None, hhmm: str | None) -> str | None:
    """Combine a date and an ``H:M`` time into an ISO datetime string.

    Args:
        game_date: Date text accepted by ``datetime.fromisoformat``.
        hhmm: Time text such as "21:30"; surrounding whitespace is ignored.

    Returns:
        The ISO datetime string with seconds set to ``00``, or ``None`` when
        either input is empty, the time has no colon, or the values do not
        form a valid datetime.
    """
    if not game_date or not hhmm:
        return None

    hour_part, colon, minute_part = hhmm.strip().partition(":")
    if not colon:
        return None

    try:
        stamp = f"{game_date}T{int(hour_part):02d}:{int(minute_part):02d}:00"
        return datetime.fromisoformat(stamp).isoformat()
    except ValueError:
        # Non-numeric parts and out-of-range values both end up here.
        return None
def derive_umpires(record_data: dict[str, Any]) -> dict[str, str | None]:
    """Extract the umpire names from record data.

    The first ``etcRecords`` item whose ``how`` is "심판" is used; its
    ``result`` text is split on whitespace and the first four names are
    assigned to chief, first, second and third base in that order.

    Args:
        record_data: Record payload.

    Returns:
        A dict with ``chief``, ``first_base``, ``second_base`` and
        ``third_base``; positions without a name are ``None``.
    """
    names: list[str] = []
    for item in record_data.get("etcRecords", []):
        if item.get("how") == "심판":
            names = item.get("result", "").split()
            break

    roles = ("chief", "first_base", "second_base", "third_base")
    return {
        role: (names[index] if index < len(names) else None)
        for index, role in enumerate(roles)
    }
def extract_pitching_summary(record_data: dict[str, Any]) -> dict[str, list[str]]:
    """Group pitcher names by decision label (win, loss, hold, save).

    Each ``pitchingResult`` entry's ``wls`` value is translated through
    ``result_labels()``; entries whose label is not one of the four summary
    keys are ignored.

    Args:
        record_data: Record payload.

    Returns:
        A dict keyed by "승리투수", "패전투수", "홀드" and "세이브", each holding
        the matching pitcher names in payload order.
    """
    labels_by_code = result_labels()
    summary: dict[str, list[str]] = {
        label: [] for label in ("승리투수", "패전투수", "홀드", "세이브")
    }
    for entry in record_data.get("pitchingResult", []):
        label = labels_by_code.get(entry.get("wls"))
        if not label or label not in summary:
            continue
        summary[label].append(entry["name"])
    return summary

535
crawler/relay_parser.py Normal file
View File

@@ -0,0 +1,535 @@
"""
crawler/relay_parser.py — relay 데이터 파싱
네이버 textRelays를 분석하여 이닝별/타석별 구조화된 이벤트로 변환합니다.
"""
from __future__ import annotations
import re
from collections import defaultdict
from typing import Any
from core.config_loader import (
skip_option_types,
hidden_event_texts,
change_keywords,
max_inning,
)
from core.review_parser import parse_review_event_text
# ──────────────────────────────────────────────
# 정렬 키
# ──────────────────────────────────────────────
def _option_seqno(option: dict[str, Any]) -> int:
return int(option.get("seqno", -1))
def _relay_seqno(relay: dict[str, Any]) -> int:
seqnos = [
_option_seqno(opt)
for opt in relay.get("textOptions", [])
if opt.get("seqno") is not None
]
return min(seqnos) if seqnos else -1
# ──────────────────────────────────────────────
# 제목 추출
# ──────────────────────────────────────────────
def get_half_inning_title(
    relays: list[dict[str, Any]], inning: int, home_or_away: int,
) -> str:
    """Return the title of a half inning.

    The stripped text of the first option with ``type == 0`` found in
    ``relays`` is used. When no such option exists a title is generated from
    the inning number and the half.

    Args:
        relays: Relay blocks of the half inning.
        inning: Inning number.
        home_or_away: 0 for the top half; any other value is the bottom half.

    Returns:
        The title text, or a generated fallback such as "3회초" / "3회말".
    """
    for relay in relays:
        for opt in relay.get("textOptions", []):
            if opt.get("type") == 0:
                return opt.get("text", "").strip()
    # Both branches used to be empty strings, so the fallback title could not
    # tell the top half from the bottom half.
    half_label = "회초" if home_or_away == 0 else "회말"
    return f"{inning}{half_label}"
def _get_batter_title(relay: dict[str, Any], options: list[dict[str, Any]]) -> str:
"""릴레이 블록에서 타자 이름/제목 추출"""
batter_title = next(
(opt.get("text", "").strip() for opt in options if opt.get("type") == 8),
"",
)
if batter_title:
return batter_title
title = (relay.get("title") or "").strip()
if title and "공격" not in title and not title.startswith("="):
return title
return ""
# ──────────────────────────────────────────────
# 투구/주루/교체 파싱
# ──────────────────────────────────────────────
def _format_pitch_text(option: dict[str, Any]) -> str:
"""투구 옵션 → 포맷된 텍스트"""
text = option.get("text", "").strip()
speed = str(option.get("speed") or "").strip()
stuff = str(option.get("stuff") or "").strip()
details = []
if speed:
details.append(f"{speed}km")
if stuff:
details.append(stuff)
return f"{text} ({', '.join(details)})" if details else text
def _classify_pitch_result(text: str, code: str | None) -> str:
"""투구 결과 텍스트 + 코드 → 정규화된 결과 코드"""
normalized = text.replace(" ", "")
if any(key in normalized for key in ("번트헛스윙", "헛스윙번트", "번트시도스트라이크")):
return "BS"
if any(key in normalized for key in ("번트파울", "번트파울.")):
return "BF"
if code in {"BS", "BF", "B", "T", "S", "F", "H"}:
return code
if code and code != "V":
return code
mapping = {
"번트 헛스윙": "BS",
"번트헛스윙": "BS",
"번트 파울": "BF",
"번트파울": "BF",
"": "B",
"스트라이크": "T",
"헛스윙": "S",
"파울": "F",
"타격": "H",
}
for key, value in mapping.items():
if key in text:
return value
return ""
def _classify_result_type(text: str) -> str:
"""결과 텍스트 → result.type 코드"""
clean_text = text.replace(" ", "")
if "낫아웃" in clean_text:
return "strikeout_not_out"
if "고의사구" in text:
return "intentional_walk"
if "볼넷" in text:
return "walk"
if "삼진" in text:
return "strikeout"
if any(k in text for k in ["몸에 맞는 볼", "몸에 맞는 공", "사구", "헤드샷"]):
return "hit_by_pitch"
if "홈런" in text:
return "home_run"
if "3루타" in text:
return "triple"
if "2루타" in text:
return "double"
if "번트안타" in text:
return "bunt_hit"
if "1루타" in text or "내야안타" in text:
return "single"
if "실책" in text and "출루" in text:
return "reach_on_error"
if "야수선택" in text:
return "reach_on_fielder_choice"
if "땅볼로 출루" in text or "땅볼출루" in text:
return "reach_on_grounder"
if "희생번트" in text:
return "sacrifice_bunt"
if "희생플라이" in text:
return "sacrifice_fly"
if "병살타" in text:
return "double_play"
if any(k in text for k in [
"플라이 아웃", "땅볼 아웃", "인필드플라이 아웃",
"라인드라이브 아웃", "직선타 아웃", "라인드라이브", "직선타",
]):
return "out"
return "play"
def _parse_runner_event(text: str) -> dict[str, Any]:
"""주루 이벤트 텍스트 → 구조화된 dict"""
event_type = "runner_event"
if "도루" in text:
event_type = "steal_fail" if "실패" in text else "steal"
elif "홈인" in text:
event_type = "score"
elif "포스아웃" in text:
event_type = "force_out"
elif "견제사" in text:
event_type = "pickoff_out"
elif "태그아웃" in text:
event_type = "tag_out"
elif "실책" in text:
event_type = "error_advance"
elif "폭투" in text:
event_type = "wild_pitch_advance"
elif "포일" in text:
event_type = "passed_ball_advance"
elif "진루" in text:
event_type = "advance"
from_base = None
to_base = None
for label, base in (("1루주자", 1), ("2루주자", 2), ("3루주자", 3), ("1루", 1), ("2루", 2), ("3루", 3)):
if label in text and from_base is None:
from_base = base
for label, base in (("1루까지", 1), ("2루까지", 2), ("3루까지", 3)):
if label in text:
to_base = base
if "홈인" in text:
to_base = 4
runner_name = (
text.split(" : ", 1)[0]
.replace("1루주자 ", "")
.replace("2루주자 ", "")
.replace("3루주자 ", "")
.replace("대주자 ", "")
.strip()
)
extra_advance = 0
if "주자의 재치로" in text and from_base is not None and to_base is not None:
extra_advance = max(0, to_base - from_base)
# action_label: 관리자 사이트 버튼 라벨 매핑
clean_text = text.replace(" ", "")
if "실책으로" in clean_text:
action_label = "수비 실책"
elif "도루" in clean_text:
action_label = "도루성공" if "실패" not in clean_text else "도루시도 아웃"
elif "폭투" in clean_text:
action_label = "폭투-진루성공"
elif "포일" in clean_text:
action_label = "포일-진루성공"
elif "태그" in clean_text:
action_label = "태그아웃"
elif "포스" in clean_text:
action_label = "포스아웃"
elif "견제" in clean_text:
action_label = "견제 아웃"
elif any(k in clean_text for k in ["볼넷", "포볼", "고의사구", "몸에맞는", "사구"]):
action_label = "볼넷 진루"
else:
action_label = "일반 진루"
return {
"type": event_type,
"runner": runner_name,
"fromBase": from_base,
"toBase": to_base,
"extra_advance": extra_advance,
"text": text,
"action_label": action_label,
}
def _parse_change_event(text: str) -> dict[str, Any]:
    """Parse a substitution / position-change relay text.

    The text is expected to look like "<actor> : <detail>". A text without
    the " : " separator used to raise ``IndexError``; it now yields an event
    that carries only the actor fields.

    Args:
        text: Raw change text.

    Returns:
        A dict that always has ``event_type``, ``change_type``, ``text``,
        ``actor_role`` and ``actor_name``, plus ``bat_order`` when the actor
        is given as "<N>번타자". Position changes add ``player_name`` and
        ``to_position``; substitutions add ``out_player``, ``in_player``,
        ``in_role`` and, depending on the roles involved, ``to_position``,
        ``player_name`` and ``pitcher_in_player``.
    """
    is_position_change = "수비위치 변경" in text
    event: dict[str, Any] = {
        "event_type": "change",
        "change_type": "position_change" if is_position_change else "substitution",
        "text": text,
    }
    actor_role, batter_order, actor_name = _extract_change_actor(text)
    event["actor_role"] = actor_role
    event["actor_name"] = actor_name
    if batter_order:
        event["bat_order"] = int(batter_order)

    _, separator, detail = text.partition(" : ")
    if not separator:
        # Nothing to the right of the actor to interpret; return what we have
        # instead of failing on the missing detail part.
        return event

    if is_position_change:
        event["player_name"] = actor_name
        event["to_position"] = detail.split("(으)로", 1)[0].strip()
        return event

    incoming = detail.split("(으)로 교체", 1)[0].strip()
    in_role, _, in_name = _extract_change_actor(incoming)
    event["out_player"] = actor_name
    event["in_player"] = in_name
    event["in_role"] = in_role

    field_roles = {"투수", "포수", "1루수", "2루수", "3루수", "유격수", "좌익수", "중견수", "우익수"}
    if actor_role in field_roles and in_role == "투수":
        # A fielder replaced by a pitcher is reported as one merged event.
        event["change_type"] = "merged_pitcher_substitution"
        event["player_name"] = actor_name
        event["to_position"] = "지명타자"
        event["pitcher_in_player"] = in_name
        return event

    pinch_roles = {"대타", "대주자"}
    if in_role in field_roles | pinch_roles:
        # Pinch hitters / runners have no fielding position.
        event["to_position"] = None if in_role in pinch_roles else in_role
    return event
def _extract_change_actor(text: str) -> tuple[str | None, str | None, str]:
"""교체 텍스트에서 역할/타순/이름 추출"""
lhs = text.split(" : ", 1)[0].strip()
if "번타자 " in lhs:
order_match = re.search(r"(\d+)번타자\s+(.+)$", lhs)
if order_match:
return "batter", order_match.group(1), order_match.group(2).strip()
for role in (
"대타", "대주자", "1루주자", "2루주자", "3루주자", "주자",
"투수", "포수", "1루수", "2루수", "3루수",
"유격수", "좌익수", "중견수", "우익수",
):
if lhs.startswith(role + " "):
return role, None, lhs[len(role):].strip()
return None, None, lhs
# ──────────────────────────────────────────────
# 주루 이벤트 병합
# ──────────────────────────────────────────────
def _merge_runner_events(runner_events: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""동일 주자의 이벤트를 병합"""
merged: dict[str, dict[str, Any]] = {}
for r in runner_events:
name = r.get("runner")
if not name:
continue
if name in merged:
merged[name]["type"] = r.get("type", merged[name]["type"])
merged[name]["text"] += f" / {r.get('text', '')}"
if r.get("toBase"):
merged[name]["toBase"] = r["toBase"]
if r.get("extra_advance"):
merged[name]["extra_advance"] = r["extra_advance"]
if "태그아웃" in r.get("text", "") or r.get("type") == "tag_out":
merged[name]["type"] = "tag_out"
else:
merged[name] = dict(r)
return list(merged.values())
# ──────────────────────────────────────────────
# 릴레이 → 이벤트 리스트 변환
# ──────────────────────────────────────────────
def build_relay_events(relay: dict[str, Any]) -> list[dict[str, Any]]:
"""Convert one relay block into a list of change and at-bat events.

NOTE(review): leading indentation is not visible in this view of the file,
so the nesting described in the comments below is inferred from the
control-flow keywords — confirm against the real source before relying on it.

The block's ``textOptions`` are sorted with ``_option_seqno`` and split into
segments. For every segment the parsed change events are emitted first; if
the segment also produced event text, one dict with ``event_type`` "at_bat"
follows, carrying ``batter``, ``rawText``, ``pitches``, ``result``,
``runnerEvents``, ``reviewEvents``, ``extraEvents`` and an empty ``changes``.
"""
# Config-driven filters: option types to skip, texts to hide, and the
# keywords that mark an option as a change event.
skip_types = skip_option_types()
hidden_texts = hidden_event_texts()
chg_keywords = change_keywords()
options = sorted(relay.get("textOptions", []), key=_option_seqno)
# 1. Split into segments (a new pitchNum 1 means the batter has changed)
segments: list[list[dict[str, Any]]] = []
current_segment: list[dict[str, Any]] = []
for opt in options:
opt_type = opt.get("type")
# A first pitch only closes the running segment when that segment already
# holds a type-1 option; presumably every option is then appended to the
# current segment — TODO confirm nesting.
if opt_type == 1 and opt.get("pitchNum") == 1:
if any(o.get("type") == 1 for o in current_segment):
segments.append(current_segment)
current_segment = []
current_segment.append(opt)
if current_segment:
segments.append(current_segment)
# 2. Build the events of each segment
results: list[dict[str, Any]] = []
relay_batter_title = _get_batter_title(relay, options)
for i, seg_options in enumerate(segments):
seg_changes: list[dict[str, Any]] = []
seg_event_texts: list[str] = []
seg_pitches: list[dict[str, Any]] = []
seg_runner_events: list[dict[str, Any]] = []
seg_review_events: list[dict[str, Any]] = []
seg_extra_events: list[dict[str, Any]] = []
seg_result_text: str | None = None
# The first type-8 option of the segment, when present, supplies the batter name.
seg_batter_name: str | None = next(
(o.get("text", "").strip() for o in seg_options if o.get("type") == 8),
None,
)
for opt in seg_options:
ot = opt.get("type")
txt = opt.get("text", "").strip()
# Blank texts, skipped option types and hidden texts produce nothing.
if not txt or ot in skip_types:
continue
if txt in hidden_texts:
continue
# Change keywords are checked before the option type, so they win over it.
if any(k in txt for k in chg_keywords):
seg_changes.append(_parse_change_event(txt))
continue
# type 1: recorded as a pitch, with its formatted text added to the event texts.
if ot == 1:
seg_event_texts.append(_format_pitch_text(opt))
seg_pitches.append({
"pitchNo": opt.get("pitchNum"),
"pitchResult": _classify_pitch_result(txt, opt.get("pitchResult")),
# Removes the first occurrence of the pitch number's string form from the text.
"pitchResultText": txt.replace(f"{opt.get('pitchNum')}", "", 1),
"speedKmh": int(opt["speed"]) if opt.get("speed") not in (None, "") else None,
"pitchType": opt.get("stuff"),
"runnerEvents": [],
})
continue
# type 14: runner event attached to the latest pitch when one exists,
# otherwise kept at segment level.
if ot == 14:
if seg_pitches:
seg_pitches[-1]["runnerEvents"].append(_parse_runner_event(txt))
else:
seg_runner_events.append(_parse_runner_event(txt))
continue
# type 24: runner event always kept at segment level.
if ot == 24:
seg_runner_events.append(_parse_runner_event(txt))
continue
# Any other option contributes its text and is classified by keyword:
# review, check-swing appeal, runner event, or the at-bat result.
seg_event_texts.append(txt)
if "비디오 판독" in txt or "합의 판정" in txt:
seg_review_events.append(parse_review_event_text(txt))
elif "체크스윙" in txt:
seg_extra_events.append({"type": "appeal_or_judgement", "text": txt})
elif any(r in txt for r in ["1루주자", "2루주자", "3루주자", "대주자", "도루", "홈인", "포스아웃"]) or ("진루" in txt and "출루" not in txt):
seg_runner_events.append(_parse_runner_event(txt))
else:
seg_result_text = txt
# NOTE(review): presumably nested under the else above, so only a result
# text can supply the batter name (the part before " : ", shorter than
# 10 characters) — confirm against the real indentation.
if " : " in txt and seg_batter_name is None:
name_part = txt.split(" : ", 1)[0].strip()
if name_part and len(name_part) < 10:
seg_batter_name = name_part
# Without a name, the relay-level title is used for the first segment only.
if not seg_batter_name:
seg_batter_name = relay_batter_title if i == 0 else ""
# Merge runner events
for p in seg_pitches:
p["runnerEvents"] = _merge_runner_events(p["runnerEvents"])
seg_merged_runners = _merge_runner_events(seg_runner_events)
# Batter result object
res_obj = None
if seg_result_text:
base_type = _classify_result_type(seg_result_text)
res_obj = {"type": base_type, "text": seg_result_text}
# The last whitespace-separated token of the batter name is compared with runner names.
b_name = seg_batter_name.split()[-1] if seg_batter_name else ""
final_runners = []
for r in seg_merged_runners:
# Presumably: a runner event for the batter is folded into the result
# object and every other runner event is kept — TODO confirm which
# ``if`` the ``else`` below pairs with.
if b_name and r.get("runner") == b_name:
if base_type in {"single", "double", "triple"}:
r_type = r.get("type", "")
if r_type in {"tag_out", "force_out", "steal_fail", "pickoff_out"}:
res_obj["type"] = f"{base_type}_runner_out"
elif r_type == "error_advance":
res_obj["type"] = f"{base_type}_error_advance"
# NOTE(review): cannot tell from this view whether the two copies below
# run for every batter-runner event or only for hits — confirm.
if r.get("toBase"):
res_obj["toBase"] = r["toBase"]
if r.get("extra_advance"):
res_obj["extra_advance"] = r["extra_advance"]
else:
final_runners.append(r)
seg_merged_runners = final_runners
# Changes are emitted before the at-bat of the same segment.
if seg_changes:
results.extend(seg_changes)
# An at-bat event is emitted only when the segment produced event text.
if seg_event_texts:
full_txt = (
f"{seg_batter_name} : " + ", ".join(seg_event_texts)
if seg_batter_name
else ", ".join(seg_event_texts)
)
results.append({
"event_type": "at_bat",
"batter": seg_batter_name,
"rawText": full_txt,
"pitches": seg_pitches,
"result": res_obj,
"runnerEvents": seg_merged_runners,
"reviewEvents": seg_review_events,
"extraEvents": seg_extra_events,
"changes": [],
})
return results
# ──────────────────────────────────────────────
# 이닝 빌드
# ──────────────────────────────────────────────
def build_half_inning(
    inning: int, home_or_away: int, relays: list[dict[str, Any]],
) -> dict[str, Any]:
    """Assemble the data of one half inning (top or bottom).

    Relay blocks are processed in ``_relay_seqno`` order. Consecutive at-bat
    events are merged into the previous one when the new event does not
    start at pitch 1 or names the same batter.

    Args:
        inning: Inning number.
        home_or_away: 0 for the top half; any other value is the bottom half.
        relays: Relay blocks belonging to this half inning.

    Returns:
        A dict with ``inning``, ``half`` ("top" / "bottom"), ``title`` and
        the merged ``events`` list.
    """
    title = get_half_inning_title(relays, inning, home_or_away)

    flat_events: list[dict[str, Any]] = []
    for relay in sorted(relays, key=_relay_seqno):
        flat_events.extend(build_relay_events(relay))

    # Merge consecutive at-bats of the same batter.
    combined: list[dict[str, Any]] = []
    for event in flat_events:
        previous = combined[-1] if combined else None
        both_at_bats = (
            previous is not None
            and event.get("event_type") == "at_bat"
            and previous.get("event_type") == "at_bat"
        )
        if not both_at_bats:
            combined.append(event)
            continue

        pitches = event.get("pitches") or []
        opening_pitch_no = pitches[0].get("pitchNo", 0) if pitches else 0
        continues_previous = (
            opening_pitch_no > 1 or previous.get("batter") == event.get("batter")
        )
        if not continues_previous:
            combined.append(event)
            continue

        previous["pitches"].extend(pitches)
        if event.get("result"):
            previous["result"] = event["result"]
        raw_text = event.get("rawText")
        if raw_text:
            # Drop the "<batter> : " prefix; the previous text already has it.
            _, separator, tail = raw_text.partition(" : ")
            previous["rawText"] += " / " + (tail if separator else raw_text)
        for key in ("runnerEvents", "reviewEvents", "extraEvents"):
            previous[key].extend(event.get(key) or [])

    return {
        "inning": inning,
        "half": "top" if home_or_away == 0 else "bottom",
        "title": title,
        "events": combined,
    }
def parse_inning_value(val: Any, default: float) -> float:
    """Parse an inning argument into a sortable number.

    "<N>" and "<N>T" map to ``N.0`` (top), "<N>B" maps to ``N.5`` (bottom);
    the suffix is case-insensitive. Anything else is passed to ``float()``.

    Args:
        val: Inning value such as "1T", "3B", 7 or ``None``.
        default: Returned for ``None``, blank or unparsable input.

    Returns:
        The numeric inning position.
    """
    if val is None:
        return default
    token = str(val).upper().strip()
    if not token:
        return default

    parsed = re.match(r"^(\d+)([TB]?)$", token)
    if parsed is None:
        try:
            return float(token)
        except ValueError:
            return default

    inning_no = int(parsed.group(1))
    return inning_no + 0.5 if parsed.group(2) == "B" else float(inning_no)

270
crawler/report_builder.py Normal file
View File

@@ -0,0 +1,270 @@
"""
crawler/report_builder.py — 최종 JSON 리포트 생성
네이버 API 데이터를 수집하고, relay 파싱 결과를 합쳐서
정규화된 게임 리포트 JSON을 생성/저장합니다.
"""
from __future__ import annotations
import json
from collections import defaultdict
from pathlib import Path
from typing import Any
from core.config_loader import max_inning
from crawler.naver_api import (
NaverApiClient,
build_iso_datetime,
clean_game_id,
derive_umpires,
extract_pitching_summary,
get_team_names,
infer_game_type,
)
from crawler.relay_parser import build_half_inning, parse_inning_value
from crawler.lineup_builder import build_lineup_summary
# ──────────────────────────────────────────────
# 이닝 데이터 수집
# ──────────────────────────────────────────────
def collect_inning_data(
    api: NaverApiClient,
    game_id: str,
    start_inning_val: str | None = None,
    end_inning_val: str | None = None,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Fetch the relay data inning by inning and structure it.

    Innings are requested from 1 up to ``max_inning()``. Collection stops at
    the first inning whose request fails or returns no relays (best effort:
    the failure is not reported). Raw relays are kept for every fetched
    inning; structured half innings only for those inside the requested range.

    Args:
        api: Open API client.
        game_id: Game identifier.
        start_inning_val: First half inning to include (e.g. "3T"); no lower
            bound when ``None``.
        end_inning_val: Last half inning to include (e.g. "5B"); no upper
            bound when ``None``.

    Returns:
        ``(half_innings, raw_relays)``.
    """
    half_innings: list[dict[str, Any]] = []
    all_relays: list[dict[str, Any]] = []
    lower = parse_inning_value(start_inning_val, 0.0)
    upper = parse_inning_value(end_inning_val, 99.0)

    for inning in range(1, max_inning() + 1):
        try:
            relay_data = api.fetch_relay(game_id, inning=inning)
        except Exception:
            break
        relays = relay_data.get("textRelays", [])
        if not relays:
            break

        by_side: dict[int, list[dict[str, Any]]] = defaultdict(list)
        for relay in relays:
            by_side[int(relay.get("homeOrAway", -1))].append(relay)
        all_relays.extend(relays)

        # Side 0 is the top half (N.0), side 1 the bottom half (N.5).
        for side, offset in ((0, 0.0), (1, 0.5)):
            side_relays = by_side.get(side, [])
            if not side_relays:
                continue
            position = inning + offset
            if position < lower or position > upper:
                continue
            half_innings.append(build_half_inning(inning, side, side_relays))

    return half_innings, all_relays
# ──────────────────────────────────────────────
# 점수 타임라인 & 블론세이브
# ──────────────────────────────────────────────
def _collect_score_timeline(raw_relays: list[dict[str, Any]]) -> list[dict[str, Any]]:
timeline: list[dict[str, Any]] = []
for relay in raw_relays:
for option in relay.get("textOptions", []):
state = option.get("currentGameState") or {}
if not state:
continue
timeline.append({
"seqno": option.get("seqno"),
"home_score": int(state.get("homeScore", 0)),
"away_score": int(state.get("awayScore", 0)),
})
timeline.sort(key=lambda item: item["seqno"])
return timeline
def _collect_blown_saves(
    raw_relays: list[dict[str, Any]], away_name: str, home_name: str,
) -> list[str]:
    """List the pitchers who gave up a lead after entering in the 7th or later.

    A pitcher qualifies when he entered (option ``type == 2`` whose incoming
    player has ``playerPos`` "투수") in inning 7 or later with his team ahead,
    and the score became tied or worse while he was still the team's pitcher.
    The scan of the score timeline stops at the entry of the next pitcher of
    the same team. Previously it ran to the end of the game, so every earlier
    reliever was also listed when a later one lost the lead.

    Args:
        raw_relays: All relay blocks of the game.
        away_name: Away team name (stored on the pitcher entries).
        home_name: Home team name (stored on the pitcher entries).

    Returns:
        The sorted, de-duplicated pitcher names.
    """
    timeline = _collect_score_timeline(raw_relays)

    pitcher_entries: list[dict[str, Any]] = []
    for relay in raw_relays:
        inning = int(relay.get("inn", 0) or 0)
        if inning < 7:
            continue
        batting_side = int(relay.get("homeOrAway", -1))
        # While the away team bats (side 0) the home team is pitching.
        pitcher_team = "home" if batting_side == 0 else "away"
        pitcher_team_name = home_name if pitcher_team == "home" else away_name
        for option in relay.get("textOptions", []):
            if option.get("type") != 2:
                continue
            in_player = (option.get("playerChange") or {}).get("inPlayer") or {}
            if in_player.get("playerPos") != "투수":
                continue
            entry_seqno = option.get("seqno")
            if entry_seqno is None:
                # Cannot be placed on the timeline.
                continue
            state = option.get("currentGameState") or {}
            pitcher_entries.append({
                "name": in_player.get("playerName"),
                "team": pitcher_team,
                "team_name": pitcher_team_name,
                "entry_seqno": entry_seqno,
                "home_score": int(state.get("homeScore") or 0),
                "away_score": int(state.get("awayScore") or 0),
            })

    def scores_for(team: str, snapshot: dict[str, Any]) -> tuple[int, int]:
        # Return (own score, opponent score) from the given team's viewpoint.
        if team == "home":
            return snapshot["home_score"], snapshot["away_score"]
        return snapshot["away_score"], snapshot["home_score"]

    blown_save_pitchers: set[str] = set()
    for entry in pitcher_entries:
        team_score, opp_score = scores_for(entry["team"], entry)
        if team_score <= opp_score:
            continue

        # The pitcher is responsible until the next pitcher of his team enters.
        relieved_at = min(
            (
                other["entry_seqno"]
                for other in pitcher_entries
                if other["team"] == entry["team"]
                and other["entry_seqno"] > entry["entry_seqno"]
            ),
            default=None,
        )
        for snapshot in timeline:
            if snapshot["seqno"] <= entry["entry_seqno"]:
                continue
            if relieved_at is not None and snapshot["seqno"] >= relieved_at:
                break
            current_team, current_opp = scores_for(entry["team"], snapshot)
            if current_team <= current_opp:
                blown_save_pitchers.add(entry["name"])
                break
    return sorted(blown_save_pitchers)
# ──────────────────────────────────────────────
# 게임 정보 빌드
# ──────────────────────────────────────────────
def _build_game_info(
    game_info: dict[str, Any],
    record_data: dict[str, Any],
    review_meta: dict[str, Any],
) -> dict[str, Any]:
    """Assemble the ``game_info`` section of the report.

    Args:
        game_info: Game info payload (date, stadium, teams, season).
        record_data: Record payload, used for the umpires.
        review_meta: KBO metadata; ``END_TM`` and ``CROWD_CN`` are read.

    Returns:
        A dict with date, stadium, start/end time, season, game type, team
        names, attendance and umpires.
    """
    game_date = game_info.get("gameDate")
    section: dict[str, Any] = {
        "date": game_date,
        "stadium": game_info.get("stadium"),
        "start_time": game_info.get("gameDateTime"),
        # The end time comes as H:M only, so it is combined with the game date.
        "end_time": build_iso_datetime(game_date, review_meta.get("END_TM")),
        "season": game_info.get("seasonYear"),
        "game_type": infer_game_type(game_info),
        "home_team": game_info.get("homeTeamName"),
        "away_team": game_info.get("awayTeamName"),
        "attendance": review_meta.get("CROWD_CN"),
        "umpires": derive_umpires(record_data),
    }
    return section
def _build_pitcher_section(
    record_data: dict[str, Any],
    raw_relays: list[dict[str, Any]],
    away_name: str,
    home_name: str,
) -> dict[str, list[str]]:
    """Assemble the ``pitching_summary`` section of the report.

    Args:
        record_data: Record payload with the pitching decisions.
        raw_relays: All relay blocks, used to detect blown saves.
        away_name: Away team name.
        home_name: Home team name.

    Returns:
        The pitching summary extended with a "블론세이브" list.
    """
    section = extract_pitching_summary(record_data)
    blown_saves = _collect_blown_saves(raw_relays, away_name, home_name)
    section["블론세이브"] = blown_saves
    return section
# ──────────────────────────────────────────────
# 리포트 빌드 & 저장
# ──────────────────────────────────────────────
def build_report(
    game_id: str,
    start_inning: str | None = None,
    end_inning: str | None = None,
) -> dict[str, Any]:
    """Build the full report for a game id.

    Fetches relay, record, game info and preview data from the Naver API
    plus the KBO metadata, then combines them into one normalized dict.

    Args:
        game_id: Game identifier; it is passed through ``clean_game_id`` first.
        start_inning: First half inning to include in ``game_contents``.
        end_inning: Last half inning to include in ``game_contents``.

    Returns:
        A dict with ``game_id``, ``game_info``, ``lineups``,
        ``game_contents`` and ``pitching_summary``.
    """
    cleaned_id = clean_game_id(game_id)

    # Every call that needs the HTTP client happens inside the with block.
    with NaverApiClient() as api:
        relay_data = api.fetch_relay(cleaned_id)
        record_data = api.fetch_record(cleaned_id)
        game_info = api.fetch_game_info(cleaned_id)
        preview_data = api.fetch_preview(cleaned_id)
        review_meta = api.fetch_kbo_review_meta(cleaned_id, game_info)
        lineups = build_lineup_summary(cleaned_id, game_info, relay_data, preview_data)
        innings, raw_relays = collect_inning_data(
            api,
            cleaned_id,
            start_inning_val=start_inning,
            end_inning_val=end_inning,
        )

    away_team_name = lineups["away_team"]["team_name"]
    home_team_name = lineups["home_team"]["team_name"]
    pitching_summary = _build_pitcher_section(
        record_data, raw_relays, away_team_name, home_team_name,
    )

    return {
        "game_id": cleaned_id,
        "game_info": _build_game_info(game_info, record_data, review_meta),
        "lineups": lineups,
        "game_contents": innings,
        "pitching_summary": pitching_summary,
    }
def filter_report(
    report: dict[str, Any],
    inning: str | None = None,
    lineup_only: bool = False,
    start_inning: str | None = None,
    end_inning: str | None = None,
) -> dict[str, Any]:
    """Return a copy of the report restricted to the requested innings.

    Args:
        report: Full report; it is copied through a JSON round trip and left
            unmodified.
        inning: A single inning. A plain number ("3") keeps both halves; an
            explicit half ("3T", "3B") keeps only that half. Takes precedence
            over ``start_inning`` / ``end_inning``.
        lineup_only: When true, ``game_contents`` is emptied and
            ``pitching_summary`` is reset to empty lists.
        start_inning: First half inning to keep (e.g. "3T").
        end_inning: Last half inning to keep (e.g. "5B").

    Returns:
        The filtered copy.
    """
    filtered = json.loads(json.dumps(report, ensure_ascii=False))
    if lineup_only:
        filtered["game_contents"] = []
        filtered["pitching_summary"] = {
            "승리투수": [], "패전투수": [], "홀드": [], "세이브": [], "블론세이브": [],
        }
        return filtered

    start_v = parse_inning_value(start_inning, 0.0)
    end_v = parse_inning_value(end_inning, 99.0)
    if inning is not None:
        iv = parse_inning_value(inning, 0.0)
        # The window used to be iv..iv+0.5 in every case, so "3B" (3.5..4.0)
        # also let the top of the 4th through and "3T" kept the bottom of
        # the 3rd. An explicit half now selects exactly that half.
        names_single_half = iv % 1 != 0 or str(inning).strip().upper().endswith("T")
        start_v = iv
        end_v = iv if names_single_half else iv + 0.5

    def half_position(half: dict[str, Any]) -> float:
        # N.0 for the top half, N.5 for the bottom half.
        offset = 0.5 if half.get("half") == "bottom" else 0.0
        return float(half.get("inning") or 0) + offset

    filtered["game_contents"] = [
        half
        for half in filtered.get("game_contents", [])
        if start_v <= half_position(half) <= end_v
    ]
    return filtered
def save_report(
    report: dict[str, Any],
    output_dir: Path,
    output_json: Path | None = None,
) -> Path:
    """Write the report to a UTF-8 JSON file.

    Args:
        report: Report dict; ``report["game_id"]`` names the default file.
        output_dir: Directory for the default file; created when missing.
        output_json: Explicit target path; when given it replaces the
            default ``<output_dir>/<game_id>_report.json``.

    Returns:
        The path that was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    game_id = report["game_id"]

    if output_json:
        target = output_json
    else:
        target = output_dir / f"{game_id}_report.json"
    target.parent.mkdir(parents=True, exist_ok=True)

    serialized = json.dumps(report, ensure_ascii=False, indent=2)
    target.write_text(serialized, encoding="utf-8")
    return target