from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Used only in the annotation of ``combine``; guarding it keeps this
    # module importable (and unit-testable) without loading faster-whisper.
    from faster_whisper.transcribe import Segment


def _speaker_at(moment: float, turns: list[dict]) -> Any:
    """Return the speaker of the first turn containing *moment*, else None."""
    for turn in turns:
        if turn["start"] <= moment < turn["end"]:
            return turn["speaker"]
    return None


def _nearest_speaker(moment: float, turns: list[dict]) -> Any:
    """Return the speaker of the turn whose edge is closest to *moment*.

    Returns None when *turns* is empty.
    """
    nearest = min(
        turns,
        key=lambda t: min(abs(t["start"] - moment), abs(t["end"] - moment)),
        default=None,
    )
    return None if nearest is None else nearest["speaker"]


def _join_words(words: list[str]) -> str:
    """Join word tokens with single spaces, ignoring blank tokens."""
    # Tokens may carry their own surrounding whitespace (faster-whisper's
    # usually start with a space), so strip each one before joining to avoid
    # doubled spaces in the result.
    stripped = (token.strip() for token in words)
    return " ".join(token for token in stripped if token)


def _merge_adjacent(runs: list[dict], max_gap: float) -> list[dict]:
    """Fuse consecutive same-speaker runs separated by less than *max_gap*."""
    merged: list[dict] = []
    for run in runs:
        prev = merged[-1] if merged else None
        if (
            prev is not None
            and run["speaker"] == prev["speaker"]
            and run["start"] - prev["end"] < max_gap
        ):
            prev["text"] = " ".join(
                text for text in (prev["text"], run["text"]) if text
            )
            prev["end"] = run["end"]
        else:
            merged.append(run)
    return merged


def combine(
    whisper_segments: "list[Segment]",
    diarization_turns: list[dict],
    *,
    max_gap: float = 0.5,
) -> list[dict]:
    """Attribute transcribed words to speakers and group them into runs.

    Each word is assigned to the first diarization turn that contains the
    midpoint of the word. A word whose midpoint falls in no turn inherits the
    speaker of the current run within its segment; if the segment has no
    speaker yet, the turn with the nearest boundary is used. Consecutive words
    with the same speaker form one run, and consecutive runs with the same
    speaker are merged when the gap between them is below ``max_gap``.

    Args:
        whisper_segments: Segments exposing ``words`` (each word having
            ``start``, ``end`` and ``word``) and ``end``. Segments without
            words are skipped.
        diarization_turns: Dicts with ``"start"``, ``"end"`` and ``"speaker"``
            keys. May be empty, in which case every run gets speaker ``None``.
        max_gap: Upper bound (exclusive), in the same time unit as the
            timestamps, on the gap between two same-speaker runs that are
            merged into one.

    Returns:
        A list of dicts with ``"start"``, ``"end"``, ``"speaker"`` and
        ``"text"`` keys, in input order.
    """
    runs: list[dict] = []

    for segment in whisper_segments:
        if not segment.words:
            continue

        run_speaker = None
        # ``run_start`` doubles as the "no word seen yet" marker so that a
        # speaker of None (no turns available) is still a valid run speaker.
        run_start = None
        run_words: list[str] = []

        for word in segment.words:
            midpoint = word.start + (word.end - word.start) / 2
            speaker = _speaker_at(midpoint, diarization_turns)
            if speaker is None:
                # Explicit None checks: falsy labels such as 0 are real
                # speakers and must not be treated as "unassigned".
                if run_speaker is not None:
                    speaker = run_speaker
                else:
                    speaker = _nearest_speaker(midpoint, diarization_turns)

            if run_start is None:
                run_speaker, run_start = speaker, word.start
            elif speaker != run_speaker:
                # Speaker changed: close the current run at the start of the
                # word that belongs to the new speaker.
                runs.append(
                    {
                        "start": run_start,
                        "end": word.start,
                        "speaker": run_speaker,
                        "text": _join_words(run_words),
                    }
                )
                run_speaker, run_start, run_words = speaker, word.start, []

            run_words.append(word.word)

        text = _join_words(run_words)
        if text:
            runs.append(
                {
                    "start": run_start,
                    "end": segment.end,
                    "speaker": run_speaker,
                    "text": text,
                }
            )

    return _merge_adjacent(runs, max_gap)