| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- from faster_whisper.transcribe import Segment
def combine(
    whisper_segments: list[Segment],
    diarization_turns: list[dict],
    *,
    merge_gap: float = 0.5,
) -> list[dict]:
    """Label transcribed words with speakers and group them into utterances.

    Each word is assigned the speaker of the first diarization turn whose
    half-open interval ``[start, end)`` contains the word's temporal midpoint.
    A word that falls outside every turn inherits the speaker of the run it
    continues; if it is the first word of its segment, the turn whose nearest
    boundary is closest to the midpoint is used instead.  When
    ``diarization_turns`` is empty such words get the speaker ``None`` rather
    than raising.

    Consecutive words with the same speaker inside one segment form a run.
    Runs are then merged across segment boundaries when they share a speaker
    and the gap between them is below ``merge_gap``.

    Args:
        whisper_segments: Objects exposing ``words`` and ``end``; each word
            exposes ``start``, ``end`` and ``word``.  Segments without words
            are skipped.
        diarization_turns: Dicts with ``"start"``, ``"end"`` and ``"speaker"``
            keys.
        merge_gap: Upper bound (exclusive) on the gap between two same-speaker
            runs for them to be merged.  Presumably seconds, matching the
            word timestamps.

    Returns:
        A list of dicts with ``"start"``, ``"end"``, ``"speaker"`` and
        ``"text"`` keys, in input order.  Word text is stripped and joined
        with single spaces.
    """
    runs: list[dict] = []

    for segment in whisper_segments:
        if not segment.words:
            continue

        # An explicit flag (rather than "speaker is truthy") keeps falsy
        # labels such as 0 or "" from being mistaken for "no speaker yet".
        run_open = False
        run_speaker = None
        run_start = segment.words[0].start
        run_words: list[str] = []

        for word in segment.words:
            midpoint = word.start + (word.end - word.start) / 2
            found, speaker = _speaker_at(midpoint, diarization_turns)
            if not found:
                if run_open:
                    # Uncovered word in the middle of a run: keep the speaker.
                    speaker = run_speaker
                else:
                    speaker = _nearest_speaker(midpoint, diarization_turns)

            if run_open and speaker == run_speaker:
                run_words.append(word.word)
                continue

            if run_open:
                # Speaker changed: the finished run ends where this word starts.
                runs.append(_make_run(run_start, word.start, run_speaker, run_words))
            run_open = True
            run_speaker = speaker
            run_start = word.start
            run_words = [word.word]

        # The trailing run extends to the segment end; it is skipped only when
        # it carries no characters at all.
        if any(run_words):
            runs.append(_make_run(run_start, segment.end, run_speaker, run_words))

    return _merge_adjacent(runs, merge_gap)


def _speaker_at(instant: float, turns: list[dict]) -> tuple:
    """Return ``(True, speaker)`` for the first turn containing ``instant``, else ``(False, None)``."""
    for turn in turns:
        if turn["start"] <= instant < turn["end"]:
            return True, turn["speaker"]
    return False, None


def _nearest_speaker(instant: float, turns: list[dict]):
    """Return the speaker of the turn whose closest boundary is nearest to ``instant``, or ``None`` if there are no turns."""
    if not turns:
        return None
    closest = min(
        turns,
        key=lambda t: min(abs(t["start"] - instant), abs(t["end"] - instant)),
    )
    return closest["speaker"]


def _make_run(start: float, end: float, speaker, words: list[str]) -> dict:
    """Build one output dict, joining the stripped, non-empty words with single spaces."""
    pieces = (piece.strip() for piece in words)
    text = " ".join(piece for piece in pieces if piece)
    return {"start": start, "end": end, "speaker": speaker, "text": text}


def _merge_adjacent(runs: list[dict], max_gap: float) -> list[dict]:
    """Fold each run into its predecessor when the speaker matches and the gap is below ``max_gap``."""
    merged: list[dict] = []
    for run in runs:
        previous = merged[-1] if merged else None
        if (
            previous is not None
            and run["speaker"] == previous["speaker"]
            and (run["start"] - previous["end"]) < max_gap
        ):
            # Join only non-empty parts so an empty side leaves no stray space.
            previous["text"] = " ".join(
                part for part in (previous["text"], run["text"]) if part
            )
            previous["end"] = run["end"]
        else:
            merged.append(run)
    return merged
|