""" crawler/relay_parser.py — relay 데이터 파싱 네이버 textRelays를 분석하여 이닝별/타석별 구조화된 이벤트로 변환합니다. """ from __future__ import annotations import re from collections import defaultdict from typing import Any from core.config_loader import ( skip_option_types, hidden_event_texts, change_keywords, max_inning, ) from core.review_parser import parse_review_event_text # ────────────────────────────────────────────── # 정렬 키 # ────────────────────────────────────────────── def _option_seqno(option: dict[str, Any]) -> int: return int(option.get("seqno", -1)) def _relay_seqno(relay: dict[str, Any]) -> int: seqnos = [ _option_seqno(opt) for opt in relay.get("textOptions", []) if opt.get("seqno") is not None ] return min(seqnos) if seqnos else -1 # ────────────────────────────────────────────── # 제목 추출 # ────────────────────────────────────────────── def get_half_inning_title( relays: list[dict[str, Any]], inning: int, home_or_away: int, ) -> str: """이닝 시작 릴레이에서 제목 추출""" for relay in relays: for opt in relay.get("textOptions", []): if opt.get("type") == 0: return opt.get("text", "").strip() half_label = "초" if home_or_away == 0 else "말" return f"{inning}회{half_label}" def _get_batter_title(relay: dict[str, Any], options: list[dict[str, Any]]) -> str: """릴레이 블록에서 타자 이름/제목 추출""" batter_title = next( (opt.get("text", "").strip() for opt in options if opt.get("type") == 8), "", ) if batter_title: return batter_title title = (relay.get("title") or "").strip() if title and "공격" not in title and not title.startswith("="): return title return "" # ────────────────────────────────────────────── # 투구/주루/교체 파싱 # ────────────────────────────────────────────── def _format_pitch_text(option: dict[str, Any]) -> str: """투구 옵션 → 포맷된 텍스트""" text = option.get("text", "").strip() speed = str(option.get("speed") or "").strip() stuff = str(option.get("stuff") or "").strip() details = [] if speed: details.append(f"{speed}km") if stuff: details.append(stuff) return f"{text} ({', '.join(details)})" if details 
else text def _classify_pitch_result(text: str, code: str | None) -> str: """투구 결과 텍스트 + 코드 → 정규화된 결과 코드""" normalized = text.replace(" ", "") if any(key in normalized for key in ("번트헛스윙", "헛스윙번트", "번트시도스트라이크")): return "BS" if any(key in normalized for key in ("번트파울", "번트파울.")): return "BF" if code in {"BS", "BF", "B", "T", "S", "F", "H"}: return code if code and code != "V": return code mapping = { "번트 헛스윙": "BS", "번트헛스윙": "BS", "번트 파울": "BF", "번트파울": "BF", "볼": "B", "스트라이크": "T", "헛스윙": "S", "파울": "F", "타격": "H", } for key, value in mapping.items(): if key in text: return value return "" def _classify_result_type(text: str) -> str: """결과 텍스트 → result.type 코드""" clean_text = text.replace(" ", "") if "낫아웃" in clean_text: return "strikeout_not_out" if "고의사구" in text: return "intentional_walk" if "볼넷" in text: return "walk" if "삼진" in text: return "strikeout" if any(k in text for k in ["몸에 맞는 볼", "몸에 맞는 공", "사구", "헤드샷"]): return "hit_by_pitch" if "홈런" in text: return "home_run" if "3루타" in text: return "triple" if "2루타" in text: return "double" if "번트안타" in text: return "bunt_hit" if "1루타" in text or "내야안타" in text: return "single" if "실책" in text and "출루" in text: return "reach_on_error" if "야수선택" in text: return "reach_on_fielder_choice" if "땅볼로 출루" in text or "땅볼출루" in text: return "reach_on_grounder" if "희생번트" in text: return "sacrifice_bunt" if "희생플라이" in text: return "sacrifice_fly" if "병살타" in text: return "double_play" if any(k in text for k in [ "플라이 아웃", "땅볼 아웃", "인필드플라이 아웃", "라인드라이브 아웃", "직선타 아웃", "라인드라이브", "직선타", ]): return "out" return "play" def _parse_runner_event(text: str) -> dict[str, Any]: """주루 이벤트 텍스트 → 구조화된 dict""" event_type = "runner_event" if "도루" in text: event_type = "steal_fail" if "실패" in text else "steal" elif "홈인" in text: event_type = "score" elif "포스아웃" in text: event_type = "force_out" elif "견제사" in text: event_type = "pickoff_out" elif "태그아웃" in text: event_type = "tag_out" elif "실책" in text: event_type = "error_advance" elif "폭투" 
in text: event_type = "wild_pitch_advance" elif "포일" in text: event_type = "passed_ball_advance" elif "진루" in text: event_type = "advance" from_base = None to_base = None for label, base in (("1루주자", 1), ("2루주자", 2), ("3루주자", 3), ("1루", 1), ("2루", 2), ("3루", 3)): if label in text and from_base is None: from_base = base for label, base in (("1루까지", 1), ("2루까지", 2), ("3루까지", 3)): if label in text: to_base = base if "홈인" in text: to_base = 4 runner_name = ( text.split(" : ", 1)[0] .replace("1루주자 ", "") .replace("2루주자 ", "") .replace("3루주자 ", "") .replace("대주자 ", "") .strip() ) extra_advance = 0 if "주자의 재치로" in text and from_base is not None and to_base is not None: extra_advance = max(0, to_base - from_base) # action_label: 관리자 사이트 버튼 라벨 매핑 clean_text = text.replace(" ", "") if "실책으로" in clean_text: action_label = "수비 실책" elif "도루" in clean_text: action_label = "도루성공" if "실패" not in clean_text else "도루시도 아웃" elif "폭투" in clean_text: action_label = "폭투-진루성공" elif "포일" in clean_text: action_label = "포일-진루성공" elif "태그" in clean_text: action_label = "태그아웃" elif "포스" in clean_text: action_label = "포스아웃" elif "견제" in clean_text: action_label = "견제 아웃" elif any(k in clean_text for k in ["볼넷", "포볼", "고의사구", "몸에맞는", "사구"]): action_label = "볼넷 진루" else: action_label = "일반 진루" return { "type": event_type, "runner": runner_name, "fromBase": from_base, "toBase": to_base, "extra_advance": extra_advance, "text": text, "action_label": action_label, } def _parse_change_event(text: str) -> dict[str, Any]: """교체 텍스트 → 구조화된 dict""" event: dict[str, Any] = { "event_type": "change", "change_type": "position_change" if "수비위치 변경" in text else "substitution", "text": text, } actor_role, batter_order, actor_name = _extract_change_actor(text) event["actor_role"] = actor_role event["actor_name"] = actor_name if batter_order: event["bat_order"] = int(batter_order) if "수비위치 변경" in text: to_position = text.split(" : ", 1)[1].split("(으)로", 1)[0].strip() event["player_name"] = actor_name 
event["to_position"] = to_position return event rhs = text.split(" : ", 1)[1].split("(으)로 교체", 1)[0].strip() in_role, _, in_name = _extract_change_actor(rhs) event["out_player"] = actor_name event["in_player"] = in_name event["in_role"] = in_role field_roles = {"투수", "포수", "1루수", "2루수", "3루수", "유격수", "좌익수", "중견수", "우익수"} if actor_role in field_roles and in_role == "투수": event["change_type"] = "merged_pitcher_substitution" event["player_name"] = actor_name event["to_position"] = "지명타자" event["pitcher_in_player"] = in_name return event extra_roles = field_roles | {"대타", "대주자"} if in_role in extra_roles: event["to_position"] = in_role if in_role not in {"대타", "대주자"} else None return event def _extract_change_actor(text: str) -> tuple[str | None, str | None, str]: """교체 텍스트에서 역할/타순/이름 추출""" lhs = text.split(" : ", 1)[0].strip() if "번타자 " in lhs: order_match = re.search(r"(\d+)번타자\s+(.+)$", lhs) if order_match: return "batter", order_match.group(1), order_match.group(2).strip() for role in ( "대타", "대주자", "1루주자", "2루주자", "3루주자", "주자", "투수", "포수", "1루수", "2루수", "3루수", "유격수", "좌익수", "중견수", "우익수", ): if lhs.startswith(role + " "): return role, None, lhs[len(role):].strip() return None, None, lhs # ────────────────────────────────────────────── # 주루 이벤트 병합 # ────────────────────────────────────────────── def _merge_runner_events(runner_events: list[dict[str, Any]]) -> list[dict[str, Any]]: """동일 주자의 이벤트를 병합""" merged: dict[str, dict[str, Any]] = {} for r in runner_events: name = r.get("runner") if not name: continue if name in merged: merged[name]["type"] = r.get("type", merged[name]["type"]) merged[name]["text"] += f" / {r.get('text', '')}" if r.get("toBase"): merged[name]["toBase"] = r["toBase"] if r.get("extra_advance"): merged[name]["extra_advance"] = r["extra_advance"] if "태그아웃" in r.get("text", "") or r.get("type") == "tag_out": merged[name]["type"] = "tag_out" else: merged[name] = dict(r) return list(merged.values()) # ────────────────────────────────────────────── # 
# ──────────────────────────────────────────────
# Relay → event list conversion
# ──────────────────────────────────────────────

def _parse_speed_kmh(raw: Any) -> int | None:
    """Return the pitch speed as an int, or None when absent or not an integer."""
    if raw in (None, ""):
        return None
    try:
        return int(raw)
    except (TypeError, ValueError):
        # Best effort: an unparsable speed should not abort the whole relay.
        return None


def build_relay_events(relay: dict[str, Any]) -> list[dict[str, Any]]:
    """Convert one relay block into a list of at-bat / change events.

    The relay's ``textOptions`` are sorted by ``seqno`` and split into segments
    (a pitch option with ``pitchNum == 1`` starts a new segment once the
    current one already holds a pitch). Each segment yields its change events
    first, followed by one ``at_bat`` event when it produced any display text.
    """
    skip_types = skip_option_types()
    hidden_texts = hidden_event_texts()
    chg_keywords = change_keywords()

    options = sorted(relay.get("textOptions") or [], key=_option_seqno)

    # 1. Split into segments (a fresh pitchNum 1 means the batter changed).
    segments: list[list[dict[str, Any]]] = []
    current_segment: list[dict[str, Any]] = []
    for opt in options:
        opt_type = opt.get("type")
        if opt_type == 1 and opt.get("pitchNum") == 1:
            # Only cut when the running segment already contains a pitch, so
            # leading non-pitch options stay attached to the first at-bat.
            if any(o.get("type") == 1 for o in current_segment):
                segments.append(current_segment)
                current_segment = []
        current_segment.append(opt)
    if current_segment:
        segments.append(current_segment)

    # 2. Build events for each segment.
    results: list[dict[str, Any]] = []
    relay_batter_title = _get_batter_title(relay, options)

    for i, seg_options in enumerate(segments):
        seg_changes: list[dict[str, Any]] = []
        seg_event_texts: list[str] = []
        seg_pitches: list[dict[str, Any]] = []
        seg_runner_events: list[dict[str, Any]] = []
        seg_review_events: list[dict[str, Any]] = []
        seg_extra_events: list[dict[str, Any]] = []
        seg_result_text: str | None = None
        seg_batter_name: str | None = next(
            ((o.get("text") or "").strip() for o in seg_options if o.get("type") == 8),
            None,
        )

        for opt in seg_options:
            ot = opt.get("type")
            txt = (opt.get("text") or "").strip()
            if not txt or ot in skip_types:
                continue
            if txt in hidden_texts:
                continue

            # Substitutions / position changes are emitted as separate events.
            if any(k in txt for k in chg_keywords):
                seg_changes.append(_parse_change_event(txt))
                continue

            # Pitch option.
            if ot == 1:
                seg_event_texts.append(_format_pitch_text(opt))
                seg_pitches.append({
                    "pitchNo": opt.get("pitchNum"),
                    "pitchResult": _classify_pitch_result(txt, opt.get("pitchResult")),
                    "pitchResultText": txt.replace(f"{opt.get('pitchNum')}구 ", "", 1),
                    "speedKmh": _parse_speed_kmh(opt.get("speed")),
                    "pitchType": opt.get("stuff"),
                    "runnerEvents": [],
                })
                continue

            # Type 14: runner event attached to the latest pitch when there is
            # one, otherwise kept at segment level.
            if ot == 14:
                if seg_pitches:
                    seg_pitches[-1]["runnerEvents"].append(_parse_runner_event(txt))
                else:
                    seg_runner_events.append(_parse_runner_event(txt))
                continue

            # Type 24: runner event always kept at segment level.
            if ot == 24:
                seg_runner_events.append(_parse_runner_event(txt))
                continue

            # Everything else contributes to the display text and is classified
            # by its wording.
            seg_event_texts.append(txt)
            if "비디오 판독" in txt or "합의 판정" in txt:
                seg_review_events.append(parse_review_event_text(txt))
            elif "체크스윙" in txt:
                seg_extra_events.append({"type": "appeal_or_judgement", "text": txt})
            elif any(
                r in txt
                for r in ["1루주자", "2루주자", "3루주자", "대주자", "도루", "홈인", "포스아웃"]
            ) or ("진루" in txt and "출루" not in txt):
                seg_runner_events.append(_parse_runner_event(txt))
            else:
                seg_result_text = txt
                # Fall back to the "<name> : ..." prefix of the result text
                # when the segment carried no explicit batter option.
                if " : " in txt and seg_batter_name is None:
                    name_part = txt.split(" : ", 1)[0].strip()
                    if name_part and len(name_part) < 10:
                        seg_batter_name = name_part

        if not seg_batter_name:
            # The relay-level title only describes the first segment's batter.
            seg_batter_name = relay_batter_title if i == 0 else ""

        # Merge runner events per runner.
        for p in seg_pitches:
            p["runnerEvents"] = _merge_runner_events(p["runnerEvents"])
        seg_merged_runners = _merge_runner_events(seg_runner_events)

        # Batter result object.
        res_obj = None
        if seg_result_text:
            base_type = _classify_result_type(seg_result_text)
            res_obj = {"type": base_type, "text": seg_result_text}
            # Last whitespace-separated token of the batter title is compared
            # against runner names.
            b_name = seg_batter_name.split()[-1] if seg_batter_name else ""

            final_runners = []
            for r in seg_merged_runners:
                if b_name and r.get("runner") == b_name:
                    # The batter's own running event is folded into the result
                    # instead of being listed as a separate runner event.
                    if base_type in {"single", "double", "triple"}:
                        r_type = r.get("type", "")
                        if r_type in {"tag_out", "force_out", "steal_fail", "pickoff_out"}:
                            res_obj["type"] = f"{base_type}_runner_out"
                        elif r_type == "error_advance":
                            res_obj["type"] = f"{base_type}_error_advance"
                    if r.get("toBase"):
                        res_obj["toBase"] = r["toBase"]
                    if r.get("extra_advance"):
                        res_obj["extra_advance"] = r["extra_advance"]
                else:
                    final_runners.append(r)
            seg_merged_runners = final_runners

        if seg_changes:
            results.extend(seg_changes)

        if seg_event_texts:
            full_txt = (
                f"{seg_batter_name} : " + ", ".join(seg_event_texts)
                if seg_batter_name
                else ", ".join(seg_event_texts)
            )
            results.append({
                "event_type": "at_bat",
                "batter": seg_batter_name,
                "rawText": full_txt,
                "pitches": seg_pitches,
                "result": res_obj,
                "runnerEvents": seg_merged_runners,
                "reviewEvents": seg_review_events,
                "extraEvents": seg_extra_events,
                "changes": [],
            })

    return results


# ──────────────────────────────────────────────
# Inning build
# ──────────────────────────────────────────────

def build_half_inning(
    inning: int,
    home_or_away: int,
    relays: list[dict[str, Any]],
) -> dict[str, Any]:
    """Build the data for one half (top/bottom) of an inning.

    Relays are processed in ascending relay ``seqno`` order. Consecutive
    ``at_bat`` events are merged into the previous one when they name the same
    batter or when their first pitch number is greater than 1.

    Returns a dict with ``inning``, ``half`` (``"top"`` when
    ``home_or_away == 0``, else ``"bottom"``), ``title`` and ``events``.
    """
    title = get_half_inning_title(relays, inning, home_or_away)

    raw_events: list[dict[str, Any]] = []
    for relay in sorted(relays, key=_relay_seqno):
        raw_events.extend(build_relay_events(relay))

    # Merge consecutive at-bats of the same batter.
    merged_events: list[dict[str, Any]] = []
    for event in raw_events:
        if not merged_events or event.get("event_type") != "at_bat":
            merged_events.append(event)
            continue

        prev = merged_events[-1]
        if prev.get("event_type") != "at_bat":
            merged_events.append(event)
            continue

        current_pitches = event.get("pitches") or []
        # pitchNo is copied from the raw option and may be None; treat that as
        # 0 so the comparison below cannot raise TypeError.
        first_pitch_no = (current_pitches[0].get("pitchNo") or 0) if current_pitches else 0
        is_same_batter = prev.get("batter") == event.get("batter")

        if first_pitch_no > 1 or is_same_batter:
            prev["pitches"].extend(current_pitches)
            if event.get("result"):
                prev["result"] = event["result"]
            if event.get("rawText"):
                current_txt = event["rawText"]
                # Drop the repeated "<batter> : " prefix before appending.
                if " : " in current_txt:
                    current_txt = current_txt.split(" : ", 1)[1]
                prev["rawText"] += " / " + current_txt
            prev["runnerEvents"].extend(event.get("runnerEvents") or [])
            prev["reviewEvents"].extend(event.get("reviewEvents") or [])
            prev["extraEvents"].extend(event.get("extraEvents") or [])
            continue

        merged_events.append(event)

    return {
        "inning": inning,
        "half": "top" if home_or_away == 0 else "bottom",
        "title": title,
        "events": merged_events,
    }


def parse_inning_value(val: Any, default: float) -> float:
    """Parse an inning argument into a float ('1T' → 1.0, '3B' → 3.5).

    ``val`` is stringified, upper-cased and stripped. ``None`` or an empty
    string returns ``default``. "<digits>" or "<digits>T" returns the number;
    "<digits>B" returns the number plus 0.5. Any other text is passed to
    ``float()``, and ``default`` is returned if that fails.
    """
    if val is None:
        return default
    s = str(val).upper().strip()
    if not s:
        return default
    m = re.match(r"^(\d+)([TB]?)$", s)
    if not m:
        try:
            return float(s)
        except ValueError:
            return default
    num = int(m.group(1))
    suffix = m.group(2)
    if suffix == "T":
        return float(num)
    if suffix == "B":
        return num + 0.5
    return float(num)