536 lines
20 KiB
Python
536 lines
20 KiB
Python
"""
|
|
crawler/relay_parser.py — relay 데이터 파싱
|
|
|
|
네이버 textRelays를 분석하여 이닝별/타석별 구조화된 이벤트로 변환합니다.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from collections import defaultdict
|
|
from typing import Any
|
|
|
|
from core.config_loader import (
|
|
skip_option_types,
|
|
hidden_event_texts,
|
|
change_keywords,
|
|
max_inning,
|
|
)
|
|
from core.review_parser import parse_review_event_text
|
|
|
|
|
|
# ──────────────────────────────────────────────
# Sort keys
# ──────────────────────────────────────────────
|
|
|
|
def _option_seqno(option: dict[str, Any]) -> int:
|
|
return int(option.get("seqno", -1))
|
|
|
|
|
|
def _relay_seqno(relay: dict[str, Any]) -> int:
|
|
seqnos = [
|
|
_option_seqno(opt)
|
|
for opt in relay.get("textOptions", [])
|
|
if opt.get("seqno") is not None
|
|
]
|
|
return min(seqnos) if seqnos else -1
|
|
|
|
|
|
# ──────────────────────────────────────────────
# Title extraction
# ──────────────────────────────────────────────
|
|
|
|
def get_half_inning_title(
    relays: list[dict[str, Any]], inning: int, home_or_away: int,
) -> str:
    """Return the title of a half-inning.

    The first non-blank text of a type-0 option, scanning the relays in the
    order given, is used as the title. If there is none, a title is
    synthesized from the inning number and the half.

    Args:
        relays: Relay dicts belonging to this half-inning.
        inning: Inning number used for the fallback title.
        home_or_away: ``0`` selects the "초" suffix in the fallback title; any
            other value selects "말".

    Returns:
        The extracted or synthesized title.
    """
    for relay in relays:
        # ``textOptions`` may be absent or null.
        for opt in relay.get("textOptions") or []:
            if opt.get("type") != 0:
                continue
            title = (opt.get("text") or "").strip()
            if title:
                return title
            # A blank title is useless to callers; keep looking and
            # eventually fall back to the synthesized label.
    half_label = "초" if home_or_away == 0 else "말"
    return f"{inning}회{half_label}"
|
|
|
|
|
|
def _get_batter_title(relay: dict[str, Any], options: list[dict[str, Any]]) -> str:
|
|
"""릴레이 블록에서 타자 이름/제목 추출"""
|
|
batter_title = next(
|
|
(opt.get("text", "").strip() for opt in options if opt.get("type") == 8),
|
|
"",
|
|
)
|
|
if batter_title:
|
|
return batter_title
|
|
title = (relay.get("title") or "").strip()
|
|
if title and "공격" not in title and not title.startswith("="):
|
|
return title
|
|
return ""
|
|
|
|
|
|
# ──────────────────────────────────────────────
# Pitch / base-running / substitution parsing
# ──────────────────────────────────────────────
|
|
|
|
def _format_pitch_text(option: dict[str, Any]) -> str:
|
|
"""투구 옵션 → 포맷된 텍스트"""
|
|
text = option.get("text", "").strip()
|
|
speed = str(option.get("speed") or "").strip()
|
|
stuff = str(option.get("stuff") or "").strip()
|
|
details = []
|
|
if speed:
|
|
details.append(f"{speed}km")
|
|
if stuff:
|
|
details.append(stuff)
|
|
return f"{text} ({', '.join(details)})" if details else text
|
|
|
|
|
|
def _classify_pitch_result(text: str, code: str | None) -> str:
|
|
"""투구 결과 텍스트 + 코드 → 정규화된 결과 코드"""
|
|
normalized = text.replace(" ", "")
|
|
if any(key in normalized for key in ("번트헛스윙", "헛스윙번트", "번트시도스트라이크")):
|
|
return "BS"
|
|
if any(key in normalized for key in ("번트파울", "번트파울.")):
|
|
return "BF"
|
|
if code in {"BS", "BF", "B", "T", "S", "F", "H"}:
|
|
return code
|
|
if code and code != "V":
|
|
return code
|
|
mapping = {
|
|
"번트 헛스윙": "BS",
|
|
"번트헛스윙": "BS",
|
|
"번트 파울": "BF",
|
|
"번트파울": "BF",
|
|
"볼": "B",
|
|
"스트라이크": "T",
|
|
"헛스윙": "S",
|
|
"파울": "F",
|
|
"타격": "H",
|
|
}
|
|
for key, value in mapping.items():
|
|
if key in text:
|
|
return value
|
|
return ""
|
|
|
|
|
|
def _classify_result_type(text: str) -> str:
|
|
"""결과 텍스트 → result.type 코드"""
|
|
clean_text = text.replace(" ", "")
|
|
if "낫아웃" in clean_text:
|
|
return "strikeout_not_out"
|
|
if "고의사구" in text:
|
|
return "intentional_walk"
|
|
if "볼넷" in text:
|
|
return "walk"
|
|
if "삼진" in text:
|
|
return "strikeout"
|
|
if any(k in text for k in ["몸에 맞는 볼", "몸에 맞는 공", "사구", "헤드샷"]):
|
|
return "hit_by_pitch"
|
|
if "홈런" in text:
|
|
return "home_run"
|
|
if "3루타" in text:
|
|
return "triple"
|
|
if "2루타" in text:
|
|
return "double"
|
|
if "번트안타" in text:
|
|
return "bunt_hit"
|
|
if "1루타" in text or "내야안타" in text:
|
|
return "single"
|
|
if "실책" in text and "출루" in text:
|
|
return "reach_on_error"
|
|
if "야수선택" in text:
|
|
return "reach_on_fielder_choice"
|
|
if "땅볼로 출루" in text or "땅볼출루" in text:
|
|
return "reach_on_grounder"
|
|
if "희생번트" in text:
|
|
return "sacrifice_bunt"
|
|
if "희생플라이" in text:
|
|
return "sacrifice_fly"
|
|
if "병살타" in text:
|
|
return "double_play"
|
|
if any(k in text for k in [
|
|
"플라이 아웃", "땅볼 아웃", "인필드플라이 아웃",
|
|
"라인드라이브 아웃", "직선타 아웃", "라인드라이브", "직선타",
|
|
]):
|
|
return "out"
|
|
return "play"
|
|
|
|
|
|
def _parse_runner_event(text: str) -> dict[str, Any]:
|
|
"""주루 이벤트 텍스트 → 구조화된 dict"""
|
|
event_type = "runner_event"
|
|
if "도루" in text:
|
|
event_type = "steal_fail" if "실패" in text else "steal"
|
|
elif "홈인" in text:
|
|
event_type = "score"
|
|
elif "포스아웃" in text:
|
|
event_type = "force_out"
|
|
elif "견제사" in text:
|
|
event_type = "pickoff_out"
|
|
elif "태그아웃" in text:
|
|
event_type = "tag_out"
|
|
elif "실책" in text:
|
|
event_type = "error_advance"
|
|
elif "폭투" in text:
|
|
event_type = "wild_pitch_advance"
|
|
elif "포일" in text:
|
|
event_type = "passed_ball_advance"
|
|
elif "진루" in text:
|
|
event_type = "advance"
|
|
|
|
from_base = None
|
|
to_base = None
|
|
for label, base in (("1루주자", 1), ("2루주자", 2), ("3루주자", 3), ("1루", 1), ("2루", 2), ("3루", 3)):
|
|
if label in text and from_base is None:
|
|
from_base = base
|
|
for label, base in (("1루까지", 1), ("2루까지", 2), ("3루까지", 3)):
|
|
if label in text:
|
|
to_base = base
|
|
if "홈인" in text:
|
|
to_base = 4
|
|
|
|
runner_name = (
|
|
text.split(" : ", 1)[0]
|
|
.replace("1루주자 ", "")
|
|
.replace("2루주자 ", "")
|
|
.replace("3루주자 ", "")
|
|
.replace("대주자 ", "")
|
|
.strip()
|
|
)
|
|
|
|
extra_advance = 0
|
|
if "주자의 재치로" in text and from_base is not None and to_base is not None:
|
|
extra_advance = max(0, to_base - from_base)
|
|
|
|
# action_label: 관리자 사이트 버튼 라벨 매핑
|
|
clean_text = text.replace(" ", "")
|
|
if "실책으로" in clean_text:
|
|
action_label = "수비 실책"
|
|
elif "도루" in clean_text:
|
|
action_label = "도루성공" if "실패" not in clean_text else "도루시도 아웃"
|
|
elif "폭투" in clean_text:
|
|
action_label = "폭투-진루성공"
|
|
elif "포일" in clean_text:
|
|
action_label = "포일-진루성공"
|
|
elif "태그" in clean_text:
|
|
action_label = "태그아웃"
|
|
elif "포스" in clean_text:
|
|
action_label = "포스아웃"
|
|
elif "견제" in clean_text:
|
|
action_label = "견제 아웃"
|
|
elif any(k in clean_text for k in ["볼넷", "포볼", "고의사구", "몸에맞는", "사구"]):
|
|
action_label = "볼넷 진루"
|
|
else:
|
|
action_label = "일반 진루"
|
|
|
|
return {
|
|
"type": event_type,
|
|
"runner": runner_name,
|
|
"fromBase": from_base,
|
|
"toBase": to_base,
|
|
"extra_advance": extra_advance,
|
|
"text": text,
|
|
"action_label": action_label,
|
|
}
|
|
|
|
|
|
def _parse_change_event(text: str) -> dict[str, Any]:
|
|
"""교체 텍스트 → 구조화된 dict"""
|
|
event: dict[str, Any] = {
|
|
"event_type": "change",
|
|
"change_type": "position_change" if "수비위치 변경" in text else "substitution",
|
|
"text": text,
|
|
}
|
|
actor_role, batter_order, actor_name = _extract_change_actor(text)
|
|
event["actor_role"] = actor_role
|
|
event["actor_name"] = actor_name
|
|
if batter_order:
|
|
event["bat_order"] = int(batter_order)
|
|
|
|
if "수비위치 변경" in text:
|
|
to_position = text.split(" : ", 1)[1].split("(으)로", 1)[0].strip()
|
|
event["player_name"] = actor_name
|
|
event["to_position"] = to_position
|
|
return event
|
|
|
|
rhs = text.split(" : ", 1)[1].split("(으)로 교체", 1)[0].strip()
|
|
in_role, _, in_name = _extract_change_actor(rhs)
|
|
event["out_player"] = actor_name
|
|
event["in_player"] = in_name
|
|
event["in_role"] = in_role
|
|
|
|
field_roles = {"투수", "포수", "1루수", "2루수", "3루수", "유격수", "좌익수", "중견수", "우익수"}
|
|
if actor_role in field_roles and in_role == "투수":
|
|
event["change_type"] = "merged_pitcher_substitution"
|
|
event["player_name"] = actor_name
|
|
event["to_position"] = "지명타자"
|
|
event["pitcher_in_player"] = in_name
|
|
return event
|
|
|
|
extra_roles = field_roles | {"대타", "대주자"}
|
|
if in_role in extra_roles:
|
|
event["to_position"] = in_role if in_role not in {"대타", "대주자"} else None
|
|
return event
|
|
|
|
|
|
def _extract_change_actor(text: str) -> tuple[str | None, str | None, str]:
|
|
"""교체 텍스트에서 역할/타순/이름 추출"""
|
|
lhs = text.split(" : ", 1)[0].strip()
|
|
if "번타자 " in lhs:
|
|
order_match = re.search(r"(\d+)번타자\s+(.+)$", lhs)
|
|
if order_match:
|
|
return "batter", order_match.group(1), order_match.group(2).strip()
|
|
for role in (
|
|
"대타", "대주자", "1루주자", "2루주자", "3루주자", "주자",
|
|
"투수", "포수", "1루수", "2루수", "3루수",
|
|
"유격수", "좌익수", "중견수", "우익수",
|
|
):
|
|
if lhs.startswith(role + " "):
|
|
return role, None, lhs[len(role):].strip()
|
|
return None, None, lhs
|
|
|
|
|
|
# ──────────────────────────────────────────────
# Merging base-running events
# ──────────────────────────────────────────────
|
|
|
|
def _merge_runner_events(runner_events: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
"""동일 주자의 이벤트를 병합"""
|
|
merged: dict[str, dict[str, Any]] = {}
|
|
for r in runner_events:
|
|
name = r.get("runner")
|
|
if not name:
|
|
continue
|
|
if name in merged:
|
|
merged[name]["type"] = r.get("type", merged[name]["type"])
|
|
merged[name]["text"] += f" / {r.get('text', '')}"
|
|
if r.get("toBase"):
|
|
merged[name]["toBase"] = r["toBase"]
|
|
if r.get("extra_advance"):
|
|
merged[name]["extra_advance"] = r["extra_advance"]
|
|
if "태그아웃" in r.get("text", "") or r.get("type") == "tag_out":
|
|
merged[name]["type"] = "tag_out"
|
|
else:
|
|
merged[name] = dict(r)
|
|
return list(merged.values())
|
|
|
|
|
|
# ──────────────────────────────────────────────
# Relay → event list conversion
# ──────────────────────────────────────────────
|
|
|
|
def _split_at_bat_segments(
    options: list[dict[str, Any]],
) -> list[list[dict[str, Any]]]:
    """Split seqno-ordered options into one segment per batter.

    A new segment starts at a pitch option (type 1) with ``pitchNum == 1``
    when the current segment already contains a pitch, i.e. a new first pitch
    means the batter has changed.
    """
    segments: list[list[dict[str, Any]]] = []
    current: list[dict[str, Any]] = []
    for opt in options:
        is_first_pitch = opt.get("type") == 1 and opt.get("pitchNum") == 1
        if is_first_pitch and any(o.get("type") == 1 for o in current):
            segments.append(current)
            current = []
        current.append(opt)
    if current:
        segments.append(current)
    return segments


def _parse_speed_kmh(raw: Any) -> int | None:
    """Return the pitch speed as an int, or ``None`` when absent or unparsable."""
    if raw in (None, ""):
        return None
    try:
        return int(raw)
    except (TypeError, ValueError):
        # A malformed speed must not abort parsing of the whole relay.
        return None


def _build_pitch_record(opt: dict[str, Any], txt: str) -> dict[str, Any]:
    """Build the structured record for one pitch option (``txt`` is its stripped text)."""
    pitch_no = opt.get("pitchNum")
    return {
        "pitchNo": pitch_no,
        "pitchResult": _classify_pitch_result(txt, opt.get("pitchResult")),
        # Drop the leading "<N>구 " counter from the description.
        "pitchResultText": txt.replace(f"{pitch_no}구 ", "", 1),
        "speedKmh": _parse_speed_kmh(opt.get("speed")),
        "pitchType": opt.get("stuff"),
        "runnerEvents": [],
    }


def build_relay_events(relay: dict[str, Any]) -> list[dict[str, Any]]:
    """Convert one relay block into a list of change and at-bat events.

    The relay's text options are ordered by ``seqno`` and split into
    per-batter segments. For every segment, the change events found in it
    are emitted first, followed by one ``at_bat`` event if the segment
    produced any event text.

    Args:
        relay: A relay dict; its ``textOptions`` list is read.

    Returns:
        Change dicts (see ``_parse_change_event``) and ``at_bat`` dicts with
        the keys ``event_type``, ``batter``, ``rawText``, ``pitches``,
        ``result``, ``runnerEvents``, ``reviewEvents``, ``extraEvents`` and
        ``changes`` (always an empty list here).
    """
    skip_types = skip_option_types()
    hidden_texts = hidden_event_texts()
    chg_keywords = change_keywords()

    # ``textOptions`` may be absent or null.
    options = sorted(relay.get("textOptions") or [], key=_option_seqno)

    results: list[dict[str, Any]] = []
    relay_batter_title = _get_batter_title(relay, options)

    for i, seg_options in enumerate(_split_at_bat_segments(options)):
        seg_changes: list[dict[str, Any]] = []
        seg_event_texts: list[str] = []
        seg_pitches: list[dict[str, Any]] = []
        seg_runner_events: list[dict[str, Any]] = []
        seg_review_events: list[dict[str, Any]] = []
        seg_extra_events: list[dict[str, Any]] = []
        seg_result_text: str | None = None

        # The batter name normally comes from the segment's type-8 option.
        seg_batter_name: str | None = next(
            ((o.get("text") or "").strip() for o in seg_options if o.get("type") == 8),
            None,
        )

        for opt in seg_options:
            ot = opt.get("type")
            txt = (opt.get("text") or "").strip()
            if not txt or ot in skip_types:
                continue
            if txt in hidden_texts:
                continue
            if any(k in txt for k in chg_keywords):
                seg_changes.append(_parse_change_event(txt))
                continue

            if ot == 1:
                seg_event_texts.append(_format_pitch_text(opt))
                seg_pitches.append(_build_pitch_record(opt, txt))
                continue

            if ot == 14:
                # Attach to the most recent pitch when there is one, otherwise
                # keep it at segment level.
                target = seg_pitches[-1]["runnerEvents"] if seg_pitches else seg_runner_events
                target.append(_parse_runner_event(txt))
                continue
            if ot == 24:
                seg_runner_events.append(_parse_runner_event(txt))
                continue

            seg_event_texts.append(txt)
            if "비디오 판독" in txt or "합의 판정" in txt:
                seg_review_events.append(parse_review_event_text(txt))
            elif "체크스윙" in txt:
                seg_extra_events.append({"type": "appeal_or_judgement", "text": txt})
            elif any(
                r in txt
                for r in ["1루주자", "2루주자", "3루주자", "대주자", "도루", "홈인", "포스아웃"]
            ) or ("진루" in txt and "출루" not in txt):
                seg_runner_events.append(_parse_runner_event(txt))
            else:
                # Anything left is treated as the batter's result; the last
                # such text wins.
                seg_result_text = txt
                if " : " in txt and seg_batter_name is None:
                    name_part = txt.split(" : ", 1)[0].strip()
                    # Long prefixes are not taken as a player name.
                    if name_part and len(name_part) < 10:
                        seg_batter_name = name_part

        if not seg_batter_name:
            # Only the first segment may inherit the relay-level title.
            seg_batter_name = relay_batter_title if i == 0 else ""

        # Merge runner events per runner, both per pitch and per segment.
        for p in seg_pitches:
            p["runnerEvents"] = _merge_runner_events(p["runnerEvents"])
        seg_merged_runners = _merge_runner_events(seg_runner_events)

        # Batter result object.
        res_obj = None
        if seg_result_text:
            base_type = _classify_result_type(seg_result_text)
            res_obj = {"type": base_type, "text": seg_result_text}

            # Runner events that concern the batter himself are folded into
            # the result instead of being listed as separate runner events.
            b_name = seg_batter_name.split()[-1] if seg_batter_name else ""
            final_runners = []
            for r in seg_merged_runners:
                if not (b_name and r.get("runner") == b_name):
                    final_runners.append(r)
                    continue
                if base_type in {"single", "double", "triple"}:
                    r_type = r.get("type", "")
                    if r_type in {"tag_out", "force_out", "steal_fail", "pickoff_out"}:
                        res_obj["type"] = f"{base_type}_runner_out"
                    elif r_type == "error_advance":
                        res_obj["type"] = f"{base_type}_error_advance"
                if r.get("toBase"):
                    res_obj["toBase"] = r["toBase"]
                if r.get("extra_advance"):
                    res_obj["extra_advance"] = r["extra_advance"]
            seg_merged_runners = final_runners

        if seg_changes:
            results.extend(seg_changes)

        if seg_event_texts:
            joined = ", ".join(seg_event_texts)
            full_txt = f"{seg_batter_name} : " + joined if seg_batter_name else joined
            results.append({
                "event_type": "at_bat",
                "batter": seg_batter_name,
                "rawText": full_txt,
                "pitches": seg_pitches,
                "result": res_obj,
                "runnerEvents": seg_merged_runners,
                "reviewEvents": seg_review_events,
                "extraEvents": seg_extra_events,
                "changes": [],
            })

    return results
|
|
|
|
|
|
# ──────────────────────────────────────────────
# Inning construction
# ──────────────────────────────────────────────
|
|
|
|
def build_half_inning(
    inning: int, home_or_away: int, relays: list[dict[str, Any]],
) -> dict[str, Any]:
    """Build the data for one half (top/bottom) of an inning.

    Relays are processed in ascending ``seqno`` order and converted to
    events. Consecutive ``at_bat`` events are then merged when the later one
    is a continuation of the earlier one: its first pitch number is greater
    than 1, or both events name the same batter.

    Args:
        inning: Inning number.
        home_or_away: ``0`` yields half ``"top"``; any other value ``"bottom"``.
        relays: Relay dicts belonging to this half-inning.

    Returns:
        A dict with the keys ``inning``, ``half``, ``title`` and ``events``.
    """
    title = get_half_inning_title(relays, inning, home_or_away)

    raw_events: list[dict[str, Any]] = []
    for relay in sorted(relays, key=_relay_seqno):
        raw_events.extend(build_relay_events(relay))

    # Merge consecutive at-bats that belong to the same plate appearance.
    merged_events: list[dict[str, Any]] = []
    for event in raw_events:
        prev = merged_events[-1] if merged_events else None
        if (
            prev is None
            or event.get("event_type") != "at_bat"
            or prev.get("event_type") != "at_bat"
        ):
            merged_events.append(event)
            continue

        current_pitches = event.get("pitches") or []
        # "pitchNo" is always present on a pitch record but may be None, so a
        # ``.get(..., 0)`` default would not protect the comparison below.
        first_pitch_no = (current_pitches[0].get("pitchNo") or 0) if current_pitches else 0
        # NOTE(review): two at-bats whose batter is unknown ("" == "") also
        # count as the same batter here — confirm that this is intended.
        is_same_batter = prev.get("batter") == event.get("batter")

        if not (first_pitch_no > 1 or is_same_batter):
            merged_events.append(event)
            continue

        prev["pitches"].extend(current_pitches)
        if event.get("result"):
            prev["result"] = event["result"]
        if event.get("rawText"):
            current_txt = event["rawText"]
            # Drop the "<batter> : " prefix; the previous text already has it.
            if " : " in current_txt:
                current_txt = current_txt.split(" : ", 1)[1]
            prev["rawText"] += " / " + current_txt
        prev["runnerEvents"].extend(event.get("runnerEvents") or [])
        prev["reviewEvents"].extend(event.get("reviewEvents") or [])
        prev["extraEvents"].extend(event.get("extraEvents") or [])

    return {
        "inning": inning,
        "half": "top" if home_or_away == 0 else "bottom",
        "title": title,
        "events": merged_events,
    }
|
|
|
|
|
|
def parse_inning_value(val: Any, default: float) -> float:
    """Parse an inning argument into a number ('1T' → 1.0, '3B' → 3.5).

    Accepted forms, case-insensitive and ignoring surrounding whitespace:
      * ``"<N>"`` or ``"<N>T"`` → ``N``
      * ``"<N>B"`` → ``N + 0.5``
      * any other finite number accepted by ``float()`` (e.g. ``"2.5"``)

    Args:
        val: The raw value; it is converted with ``str()`` before parsing.
        default: Returned when ``val`` is ``None``, blank, unparsable, or
            parses to NaN / infinity.

    Returns:
        The parsed inning value, or ``default``.
    """
    import math

    if val is None:
        return default
    s = str(val).upper().strip()
    if not s:
        return default

    m = re.match(r"^(\d+)([TB]?)$", s)
    if m:
        half_offset = 0.5 if m.group(2) == "B" else 0.0
        return int(m.group(1)) + half_offset

    try:
        value = float(s)
    except ValueError:
        return default
    # float() also accepts "nan" / "inf", which are meaningless as an inning
    # bound and make every later comparison misbehave.
    return value if math.isfinite(value) else default
|