Files
baseball-automation/crawler/relay_parser.py
2026-05-02 16:24:42 +09:00

536 lines
20 KiB
Python

"""
crawler/relay_parser.py — relay 데이터 파싱
네이버 textRelays를 분석하여 이닝별/타석별 구조화된 이벤트로 변환합니다.
"""
from __future__ import annotations
import re
from collections import defaultdict
from typing import Any
from core.config_loader import (
skip_option_types,
hidden_event_texts,
change_keywords,
max_inning,
)
from core.review_parser import parse_review_event_text
# ──────────────────────────────────────────────
# 정렬 키
# ──────────────────────────────────────────────
def _option_seqno(option: dict[str, Any]) -> int:
return int(option.get("seqno", -1))
def _relay_seqno(relay: dict[str, Any]) -> int:
seqnos = [
_option_seqno(opt)
for opt in relay.get("textOptions", [])
if opt.get("seqno") is not None
]
return min(seqnos) if seqnos else -1
# ──────────────────────────────────────────────
# 제목 추출
# ──────────────────────────────────────────────
def get_half_inning_title(
    relays: list[dict[str, Any]], inning: int, home_or_away: int,
) -> str:
    """Return the display title of a half inning.

    The text of the first type-0 option found while scanning ``relays`` in
    order is used as the title. When no relay contains such an option, a
    label is generated from the arguments instead.

    Args:
        relays: Relay dicts of the half inning; only ``textOptions`` is read.
        inning: Inning number used for the generated fallback label.
        home_or_away: ``0`` selects the top-half label, any other value the
            bottom-half label.

    Returns:
        The stripped title text, or the fallback ``"<inning>회초"`` /
        ``"<inning>회말"``.
    """
    for relay in relays:
        for opt in relay.get("textOptions", []):
            if opt.get("type") == 0:
                # Tolerate an explicit None text instead of crashing on .strip().
                return (opt.get("text") or "").strip()
    # The fallback must tell the two halves apart; previously both branches
    # produced the same empty label.
    half_label = "초" if home_or_away == 0 else "말"
    return f"{inning}회{half_label}"
def _get_batter_title(relay: dict[str, Any], options: list[dict[str, Any]]) -> str:
"""릴레이 블록에서 타자 이름/제목 추출"""
batter_title = next(
(opt.get("text", "").strip() for opt in options if opt.get("type") == 8),
"",
)
if batter_title:
return batter_title
title = (relay.get("title") or "").strip()
if title and "공격" not in title and not title.startswith("="):
return title
return ""
# ──────────────────────────────────────────────
# 투구/주루/교체 파싱
# ──────────────────────────────────────────────
def _format_pitch_text(option: dict[str, Any]) -> str:
"""투구 옵션 → 포맷된 텍스트"""
text = option.get("text", "").strip()
speed = str(option.get("speed") or "").strip()
stuff = str(option.get("stuff") or "").strip()
details = []
if speed:
details.append(f"{speed}km")
if stuff:
details.append(stuff)
return f"{text} ({', '.join(details)})" if details else text
def _classify_pitch_result(text: str, code: str | None) -> str:
"""투구 결과 텍스트 + 코드 → 정규화된 결과 코드"""
normalized = text.replace(" ", "")
if any(key in normalized for key in ("번트헛스윙", "헛스윙번트", "번트시도스트라이크")):
return "BS"
if any(key in normalized for key in ("번트파울", "번트파울.")):
return "BF"
if code in {"BS", "BF", "B", "T", "S", "F", "H"}:
return code
if code and code != "V":
return code
mapping = {
"번트 헛스윙": "BS",
"번트헛스윙": "BS",
"번트 파울": "BF",
"번트파울": "BF",
"": "B",
"스트라이크": "T",
"헛스윙": "S",
"파울": "F",
"타격": "H",
}
for key, value in mapping.items():
if key in text:
return value
return ""
def _classify_result_type(text: str) -> str:
"""결과 텍스트 → result.type 코드"""
clean_text = text.replace(" ", "")
if "낫아웃" in clean_text:
return "strikeout_not_out"
if "고의사구" in text:
return "intentional_walk"
if "볼넷" in text:
return "walk"
if "삼진" in text:
return "strikeout"
if any(k in text for k in ["몸에 맞는 볼", "몸에 맞는 공", "사구", "헤드샷"]):
return "hit_by_pitch"
if "홈런" in text:
return "home_run"
if "3루타" in text:
return "triple"
if "2루타" in text:
return "double"
if "번트안타" in text:
return "bunt_hit"
if "1루타" in text or "내야안타" in text:
return "single"
if "실책" in text and "출루" in text:
return "reach_on_error"
if "야수선택" in text:
return "reach_on_fielder_choice"
if "땅볼로 출루" in text or "땅볼출루" in text:
return "reach_on_grounder"
if "희생번트" in text:
return "sacrifice_bunt"
if "희생플라이" in text:
return "sacrifice_fly"
if "병살타" in text:
return "double_play"
if any(k in text for k in [
"플라이 아웃", "땅볼 아웃", "인필드플라이 아웃",
"라인드라이브 아웃", "직선타 아웃", "라인드라이브", "직선타",
]):
return "out"
return "play"
def _parse_runner_event(text: str) -> dict[str, Any]:
"""주루 이벤트 텍스트 → 구조화된 dict"""
event_type = "runner_event"
if "도루" in text:
event_type = "steal_fail" if "실패" in text else "steal"
elif "홈인" in text:
event_type = "score"
elif "포스아웃" in text:
event_type = "force_out"
elif "견제사" in text:
event_type = "pickoff_out"
elif "태그아웃" in text:
event_type = "tag_out"
elif "실책" in text:
event_type = "error_advance"
elif "폭투" in text:
event_type = "wild_pitch_advance"
elif "포일" in text:
event_type = "passed_ball_advance"
elif "진루" in text:
event_type = "advance"
from_base = None
to_base = None
for label, base in (("1루주자", 1), ("2루주자", 2), ("3루주자", 3), ("1루", 1), ("2루", 2), ("3루", 3)):
if label in text and from_base is None:
from_base = base
for label, base in (("1루까지", 1), ("2루까지", 2), ("3루까지", 3)):
if label in text:
to_base = base
if "홈인" in text:
to_base = 4
runner_name = (
text.split(" : ", 1)[0]
.replace("1루주자 ", "")
.replace("2루주자 ", "")
.replace("3루주자 ", "")
.replace("대주자 ", "")
.strip()
)
extra_advance = 0
if "주자의 재치로" in text and from_base is not None and to_base is not None:
extra_advance = max(0, to_base - from_base)
# action_label: 관리자 사이트 버튼 라벨 매핑
clean_text = text.replace(" ", "")
if "실책으로" in clean_text:
action_label = "수비 실책"
elif "도루" in clean_text:
action_label = "도루성공" if "실패" not in clean_text else "도루시도 아웃"
elif "폭투" in clean_text:
action_label = "폭투-진루성공"
elif "포일" in clean_text:
action_label = "포일-진루성공"
elif "태그" in clean_text:
action_label = "태그아웃"
elif "포스" in clean_text:
action_label = "포스아웃"
elif "견제" in clean_text:
action_label = "견제 아웃"
elif any(k in clean_text for k in ["볼넷", "포볼", "고의사구", "몸에맞는", "사구"]):
action_label = "볼넷 진루"
else:
action_label = "일반 진루"
return {
"type": event_type,
"runner": runner_name,
"fromBase": from_base,
"toBase": to_base,
"extra_advance": extra_advance,
"text": text,
"action_label": action_label,
}
def _parse_change_event(text: str) -> dict[str, Any]:
    """Turn a substitution / position-change text into a structured dict.

    The part before " : " describes the outgoing (or moving) player and the
    part after it the target position or incoming player.

    Args:
        text: Change text, e.g. ``"<role> <name> : <role> <name> (으)로 교체"``.

    Returns:
        A dict that always has ``event_type`` (``"change"``), ``change_type``,
        ``text``, ``actor_role`` and ``actor_name``; ``bat_order`` is added
        when a batting order was found. Position changes add ``player_name``
        and ``to_position``. Substitutions add ``out_player``, ``in_player``
        and ``in_role``, plus either the merged-pitcher fields or, for a
        known incoming role, ``to_position``.
    """
    event: dict[str, Any] = {
        "event_type": "change",
        "change_type": "position_change" if "수비위치 변경" in text else "substitution",
        "text": text,
    }
    actor_role, batter_order, actor_name = _extract_change_actor(text)
    event["actor_role"] = actor_role
    event["actor_name"] = actor_name
    if batter_order:
        event["bat_order"] = int(batter_order)

    # partition() yields "" for the detail part when " : " is absent, where
    # split(" : ", 1)[1] would raise IndexError and abort the whole parse.
    _, _, detail = text.partition(" : ")

    if "수비위치 변경" in text:
        event["player_name"] = actor_name
        event["to_position"] = detail.split("(으)로", 1)[0].strip()
        return event

    rhs = detail.split("(으)로 교체", 1)[0].strip()
    in_role, _, in_name = _extract_change_actor(rhs)
    event["out_player"] = actor_name
    event["in_player"] = in_name
    event["in_role"] = in_role

    field_roles = {"투수", "포수", "1루수", "2루수", "3루수", "유격수", "좌익수", "중견수", "우익수"}
    if actor_role in field_roles and in_role == "투수":
        # A fielding role replaced by a pitcher is reported as a merged
        # pitcher substitution: the outgoing player is assigned "지명타자"
        # and the incoming pitcher is recorded separately.
        event["change_type"] = "merged_pitcher_substitution"
        event["player_name"] = actor_name
        event["to_position"] = "지명타자"
        event["pitcher_in_player"] = in_name
        return event

    extra_roles = field_roles | {"대타", "대주자"}
    if in_role in extra_roles:
        # Pinch hitters/runners do not take a fielding position.
        event["to_position"] = in_role if in_role not in {"대타", "대주자"} else None
    return event
def _extract_change_actor(text: str) -> tuple[str | None, str | None, str]:
"""교체 텍스트에서 역할/타순/이름 추출"""
lhs = text.split(" : ", 1)[0].strip()
if "번타자 " in lhs:
order_match = re.search(r"(\d+)번타자\s+(.+)$", lhs)
if order_match:
return "batter", order_match.group(1), order_match.group(2).strip()
for role in (
"대타", "대주자", "1루주자", "2루주자", "3루주자", "주자",
"투수", "포수", "1루수", "2루수", "3루수",
"유격수", "좌익수", "중견수", "우익수",
):
if lhs.startswith(role + " "):
return role, None, lhs[len(role):].strip()
return None, None, lhs
# ──────────────────────────────────────────────
# 주루 이벤트 병합
# ──────────────────────────────────────────────
def _merge_runner_events(runner_events: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""동일 주자의 이벤트를 병합"""
merged: dict[str, dict[str, Any]] = {}
for r in runner_events:
name = r.get("runner")
if not name:
continue
if name in merged:
merged[name]["type"] = r.get("type", merged[name]["type"])
merged[name]["text"] += f" / {r.get('text', '')}"
if r.get("toBase"):
merged[name]["toBase"] = r["toBase"]
if r.get("extra_advance"):
merged[name]["extra_advance"] = r["extra_advance"]
if "태그아웃" in r.get("text", "") or r.get("type") == "tag_out":
merged[name]["type"] = "tag_out"
else:
merged[name] = dict(r)
return list(merged.values())
# ──────────────────────────────────────────────
# 릴레이 → 이벤트 리스트 변환
# ──────────────────────────────────────────────
def build_relay_events(relay: dict[str, Any]) -> list[dict[str, Any]]:
    """Convert one relay block into a list of change and at-bat events.

    The relay's ``textOptions`` are sorted by ``seqno`` and split into
    segments: a new segment starts at a type-1 (pitch) option whose
    ``pitchNum`` is 1, provided the current segment already contains a
    type-1 option. Each segment contributes its change events (if any),
    followed by one ``at_bat`` event when the segment produced any display
    text.

    Args:
        relay: One relay dict; ``textOptions`` is read here and the relay is
            also passed to ``_get_batter_title``.

    Returns:
        Events in output order. Change events are the dicts returned by
        ``_parse_change_event``. At-bat events carry ``event_type``,
        ``batter``, ``rawText``, ``pitches``, ``result``, ``runnerEvents``,
        ``reviewEvents``, ``extraEvents`` and an empty ``changes`` list.
    """
    # Filter/keyword sets come from project configuration.
    skip_types = skip_option_types()
    hidden_texts = hidden_event_texts()
    chg_keywords = change_keywords()
    options = sorted(relay.get("textOptions", []), key=_option_seqno)
    # 1. Split into segments (a fresh pitchNum 1 means the batter changed)
    segments: list[list[dict[str, Any]]] = []
    current_segment: list[dict[str, Any]] = []
    for opt in options:
        opt_type = opt.get("type")
        if opt_type == 1 and opt.get("pitchNum") == 1:
            # Only cut when the running segment already has a pitch, so
            # options preceding the very first pitch stay with that pitch.
            if any(o.get("type") == 1 for o in current_segment):
                segments.append(current_segment)
                current_segment = []
        current_segment.append(opt)
    if current_segment:
        segments.append(current_segment)
    # 2. Build events for each segment
    results: list[dict[str, Any]] = []
    relay_batter_title = _get_batter_title(relay, options)
    for i, seg_options in enumerate(segments):
        seg_changes: list[dict[str, Any]] = []
        seg_event_texts: list[str] = []
        seg_pitches: list[dict[str, Any]] = []
        seg_runner_events: list[dict[str, Any]] = []
        seg_review_events: list[dict[str, Any]] = []
        seg_extra_events: list[dict[str, Any]] = []
        seg_result_text: str | None = None
        # Prefer the batter name from a type-8 option inside this segment.
        seg_batter_name: str | None = next(
            (o.get("text", "").strip() for o in seg_options if o.get("type") == 8),
            None,
        )
        for opt in seg_options:
            ot = opt.get("type")
            txt = opt.get("text", "").strip()
            if not txt or ot in skip_types:
                continue
            if txt in hidden_texts:
                continue
            # Change keywords are checked before the option type, so any
            # option whose text matches is treated as a change event.
            if any(k in txt for k in chg_keywords):
                seg_changes.append(_parse_change_event(txt))
                continue
            if ot == 1:
                # Pitch option.
                seg_event_texts.append(_format_pitch_text(opt))
                seg_pitches.append({
                    "pitchNo": opt.get("pitchNum"),
                    "pitchResult": _classify_pitch_result(txt, opt.get("pitchResult")),
                    # NOTE(review): this removes only the first occurrence of
                    # the pitch number itself from the text - confirm that is
                    # the intended prefix strip.
                    "pitchResultText": txt.replace(f"{opt.get('pitchNum')}", "", 1),
                    "speedKmh": int(opt["speed"]) if opt.get("speed") not in (None, "") else None,
                    "pitchType": opt.get("stuff"),
                    "runnerEvents": [],
                })
                continue
            if ot == 14:
                # Type-14 runner text is attached to the most recent pitch
                # when one exists, otherwise kept at segment level.
                if seg_pitches:
                    seg_pitches[-1]["runnerEvents"].append(_parse_runner_event(txt))
                else:
                    seg_runner_events.append(_parse_runner_event(txt))
                continue
            if ot == 24:
                # Type-24 runner text always stays at segment level.
                seg_runner_events.append(_parse_runner_event(txt))
                continue
            # Remaining option types: the text is shown as-is and then
            # classified by its content.
            seg_event_texts.append(txt)
            if "비디오 판독" in txt or "합의 판정" in txt:
                seg_review_events.append(parse_review_event_text(txt))
            elif "체크스윙" in txt:
                seg_extra_events.append({"type": "appeal_or_judgement", "text": txt})
            elif any(r in txt for r in ["1루주자", "2루주자", "3루주자", "대주자", "도루", "홈인", "포스아웃"]) or ("진루" in txt and "출루" not in txt):
                seg_runner_events.append(_parse_runner_event(txt))
            else:
                # Anything else is taken as the batter's result; a later
                # result text overwrites an earlier one.
                seg_result_text = txt
                if " : " in txt and seg_batter_name is None:
                    # Fall back to the "<name> : ..." prefix of the result
                    # text; prefixes of 10+ characters are not accepted.
                    name_part = txt.split(" : ", 1)[0].strip()
                    if name_part and len(name_part) < 10:
                        seg_batter_name = name_part
        if not seg_batter_name:
            # The relay-level title is only applied to the first segment.
            seg_batter_name = relay_batter_title if i == 0 else ""
        # Merge runner events
        for p in seg_pitches:
            p["runnerEvents"] = _merge_runner_events(p["runnerEvents"])
        seg_merged_runners = _merge_runner_events(seg_runner_events)
        # Batter result object
        res_obj = None
        if seg_result_text:
            base_type = _classify_result_type(seg_result_text)
            res_obj = {"type": base_type, "text": seg_result_text}
            # Last whitespace-separated token of the batter title is compared
            # with runner names (presumably the bare player name - verify).
            b_name = seg_batter_name.split()[-1] if seg_batter_name else ""
            final_runners = []
            for r in seg_merged_runners:
                if b_name and r.get("runner") == b_name:
                    # A runner event about the batter is folded into the
                    # result instead of being kept as a runner event.
                    if base_type in {"single", "double", "triple"}:
                        r_type = r.get("type", "")
                        if r_type in {"tag_out", "force_out", "steal_fail", "pickoff_out"}:
                            res_obj["type"] = f"{base_type}_runner_out"
                        elif r_type == "error_advance":
                            res_obj["type"] = f"{base_type}_error_advance"
                    if r.get("toBase"):
                        res_obj["toBase"] = r["toBase"]
                    if r.get("extra_advance"):
                        res_obj["extra_advance"] = r["extra_advance"]
                else:
                    final_runners.append(r)
            seg_merged_runners = final_runners
        # Change events are emitted ahead of the segment's at-bat event.
        if seg_changes:
            results.extend(seg_changes)
        if seg_event_texts:
            full_txt = (
                f"{seg_batter_name} : " + ", ".join(seg_event_texts)
                if seg_batter_name
                else ", ".join(seg_event_texts)
            )
            results.append({
                "event_type": "at_bat",
                "batter": seg_batter_name,
                "rawText": full_txt,
                "pitches": seg_pitches,
                "result": res_obj,
                "runnerEvents": seg_merged_runners,
                "reviewEvents": seg_review_events,
                "extraEvents": seg_extra_events,
                "changes": [],
            })
    return results
# ──────────────────────────────────────────────
# 이닝 빌드
# ──────────────────────────────────────────────
def build_half_inning(
    inning: int, home_or_away: int, relays: list[dict[str, Any]],
) -> dict[str, Any]:
    """Assemble the data for one half (top or bottom) of an inning.

    Relays are processed in ascending ``_relay_seqno`` order and converted
    with ``build_relay_events``. Consecutive ``at_bat`` events are then
    folded into the previous one when they continue the same plate
    appearance.

    Args:
        inning: Inning number.
        home_or_away: ``0`` is reported as ``"top"``, anything else as
            ``"bottom"``.
        relays: Relay dicts belonging to this half inning.

    Returns:
        A dict with ``inning``, ``half``, ``title`` and the merged ``events``.
    """
    title = get_half_inning_title(relays, inning, home_or_away)

    raw_events: list[dict[str, Any]] = []
    for relay in sorted(relays, key=_relay_seqno):
        raw_events.extend(build_relay_events(relay))

    # Merge consecutive at-bat events that belong to the same batter.
    merged_events: list[dict[str, Any]] = []
    for event in raw_events:
        prev = merged_events[-1] if merged_events else None
        if (
            prev is None
            or event.get("event_type") != "at_bat"
            or prev.get("event_type") != "at_bat"
        ):
            merged_events.append(event)
            continue

        current_pitches = event.get("pitches") or []
        # "pitchNo" may be present with a None value; dict.get's default
        # would not apply then and `None > 1` raises TypeError, so coerce.
        first_pitch_no = (
            (current_pitches[0].get("pitchNo") or 0) if current_pitches else 0
        )
        is_same_batter = prev.get("batter") == event.get("batter")

        # A first pitch numbered above 1 means the count carried over from
        # the previous event, i.e. the plate appearance is still going.
        if first_pitch_no > 1 or is_same_batter:
            prev["pitches"].extend(current_pitches)
            if event.get("result"):
                prev["result"] = event["result"]
            if event.get("rawText"):
                current_txt = event["rawText"]
                # Drop the repeated "<batter> : " prefix before appending.
                if " : " in current_txt:
                    current_txt = current_txt.split(" : ", 1)[1]
                prev["rawText"] += " / " + current_txt
            prev["runnerEvents"].extend(event.get("runnerEvents") or [])
            prev["reviewEvents"].extend(event.get("reviewEvents") or [])
            prev["extraEvents"].extend(event.get("extraEvents") or [])
            continue

        merged_events.append(event)

    return {
        "inning": inning,
        "half": "top" if home_or_away == 0 else "bottom",
        "title": title,
        "events": merged_events,
    }
def parse_inning_value(val: Any, default: float) -> float:
    """Parse an inning argument into a sortable float.

    ``"<n>"`` and ``"<n>T"`` map to ``n`` and ``"<n>B"`` to ``n + 0.5``
    (e.g. ``'1T'`` -> 1.0, ``'3B'`` -> 3.5); matching is case-insensitive
    and ignores surrounding whitespace. Other strings are accepted when
    ``float()`` can parse them.

    Args:
        val: Raw value; it is converted with ``str()`` before parsing.
        default: Value returned when ``val`` is ``None``, blank, unparseable
            or parses to NaN.

    Returns:
        The parsed inning position, or ``default``.
    """
    import math  # local import keeps this block self-contained

    if val is None:
        return default
    s = str(val).upper().strip()
    if not s:
        return default

    m = re.match(r"^(\d+)([TB]?)$", s)
    if m:
        num = int(m.group(1))
        # "T" and a missing suffix both denote the start of the inning.
        return num + 0.5 if m.group(2) == "B" else float(num)

    try:
        parsed = float(s)
    except ValueError:
        return default
    # float() accepts "nan"; NaN compares false against everything and would
    # silently defeat any range check built on the result.
    return default if math.isnan(parsed) else parsed