Mirror of https://github.com/patw/AudioSumma.git · synced 2025-09-04 19:50:29 +00:00
Used Mistral Large to rewrite the summarize process.
parent 81e4e165bc
commit 3c05cd78dd
1 changed file with 38 additions and 72 deletions
summarize.py (110 lines changed)
@@ -21,23 +21,18 @@ CHUNK_SIZE = int(os.getenv("CHUNK_SIZE"))
 TEMPERATURE = float(os.getenv("TEMPERATURE"))
 
 def whisper_api(file):
-    # Whisper supports multiple files, but we're sending one
+    """Transcribe audio file using Whisper API."""
     files = {"file": file}
-
-    # Required API call data
     api_data = {
         "temperature": "0.0",
         "response_format": "json"
     }
-
-    # Call API and return text
     response = requests.post(WHISPERCPP_URL, data=api_data, files=files)
     return response.json()["text"]
 
 def llama_api(prompt):
-    # Format prompt before sending
+    """Generate response using llama.cpp server API."""
     formatted_prompt = PROMPT_FORMAT.format(system=SYSTEM_MESSAGE, prompt=prompt)
-
     api_data = {
         "prompt": formatted_prompt,
         "n_predict": -1,
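Both helpers talk to local inference servers over HTTP. A standalone sketch of the same two calls, for reference; the URLs here are assumptions (summarize.py reads WHISPERCPP_URL and LLAMACPP_URL from environment variables), and the stop token is a placeholder:

import requests

# Assumed endpoints for local whisper.cpp and llama.cpp servers; the script
# itself takes these from the environment rather than hardcoding them.
WHISPERCPP_URL = "http://localhost:8080/inference"
LLAMACPP_URL = "http://localhost:8081/completion"

# Transcribe one WAV file via the whisper.cpp server (multipart upload).
with open("call.wav", "rb") as f:  # hypothetical input file
    text = requests.post(
        WHISPERCPP_URL,
        data={"temperature": "0.0", "response_format": "json"},
        files={"file": f},
    ).json()["text"]

# Complete a prompt via the llama.cpp server; n_predict=-1 generates
# until a stop token is produced.
completion = requests.post(
    LLAMACPP_URL,
    json={"prompt": text, "n_predict": -1, "stop": ["</s>"]},  # placeholder stop token
).json()["content"]
print(completion)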
@@ -45,19 +40,15 @@ def llama_api(prompt):
         "stop": [STOP_TOKEN],
         "tokens_cached": 0
     }
 
     response = requests.post(LLAMACPP_URL, headers={"Content-Type": "application/json"}, json=api_data)
     json_output = response.json()
     return json_output['content']
 
-# Use ffmpeg to trim silence in wav files, to prevent issues with
-# whisper.cpp stopping the transcode if it detects a large amount of silence
 def trim_silence(filename):
-    # Create a temporary file for the output
+    """Trim silence from audio file using FFmpeg."""
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
         temp_filename = temp_file.name
     ffmpeg_command = [
         "ffmpeg",
         "-i", filename,
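The ffmpeg filter arguments themselves fall between this hunk and the next, so the exact options aren't visible in the diff. A hypothetical sketch of the same silence-trimming step using ffmpeg's silenceremove filter; the filter values are illustrative, not the committed ones:

import subprocess
import tempfile

# Write the trimmed audio to a temporary file, as trim_silence does above.
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
    temp_filename = temp_file.name

ffmpeg_command = [
    "ffmpeg", "-y",
    "-i", "input.wav",  # hypothetical input file
    # Strip leading silence, then any silent stretch longer than ~1 second.
    "-af", "silenceremove=start_periods=1:start_threshold=-50dB:"
           "stop_periods=-1:stop_duration=1:stop_threshold=-50dB",
    temp_filename,
]
subprocess.run(ffmpeg_command, capture_output=True, text=True, check=True)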
@@ -66,84 +57,59 @@ def trim_silence(filename):
         temp_filename
     ]
 
-    # Run the FFmpeg command
     result = subprocess.run(ffmpeg_command, capture_output=True, text=True, check=True)
-
-    # If FFmpeg command was successful, replace the original file
     os.replace(temp_filename, filename)
 
-# Iterate over each WAV file and transcode with whisper API
-wav_files = [f for f in os.listdir(".") if f.endswith(".wav")]
-for wav_file in wav_files:
-
-    # Trim silence on the wav file first
-    print("Trimming silence: " + wav_file)
-    trim_silence(wav_file)
-
-    # Open the WAV file for sending to whisper REST API
-    with open(wav_file, "rb") as file:
-        print("Transcribing: " + wav_file)
-        # Call whisper API to transcode file
-        output_text = whisper_api(file)
-
-        # Generate the output file name by replacing the extension with .tns
-        output_file = os.path.splitext(wav_file)[0] + ".tns"
-
-        # Write the output text to the file
-        with open(output_file, "w") as output:
-            output.write(output_text)
+def process_wav_files():
+    """Process WAV files: trim silence and transcribe."""
+    wav_files = [f for f in os.listdir(".") if f.endswith(".wav")]
+    for wav_file in wav_files:
+        print("Trimming silence: " + wav_file)
+        trim_silence(wav_file)
+
+        with open(wav_file, "rb") as file:
+            print("Transcribing: " + wav_file)
+            output_text = whisper_api(file)
+            output_file = os.path.splitext(wav_file)[0] + ".tns"
+            with open(output_file, "w") as output:
+                output.write(output_text)
 
-# Chunk the full transcript into multiple parts to fit in the context window
-# and allow for better reasoning capability
 def chunk_transcript(string, chunk_size):
+    """Chunk the transcript to fit in the context window."""
     chunks = []
-    lines = string.split("\n") # Split the string on newline characters
+    lines = string.split("\n")
     current_chunk = ""
     for line in lines:
-        current_chunk += line # Build up the string until the chunk size is reached
+        current_chunk += line
        if len(current_chunk) >= chunk_size:
             chunks.append(current_chunk)
             current_chunk = ""
-    if current_chunk: # Add the last chunk if it's not empty
+    if current_chunk:
         chunks.append(current_chunk)
     return chunks
 
-# Get the current date in yyyymmdd format
-today = datetime.datetime.now().strftime('%Y%m%d')
-
-# Modify the filename by appending the current date
-summary_filename = "summary-" + today + ".md"
-
-# Get the list of transcript files in the current directory
-transcript_files = [f for f in os.listdir(".") if f.endswith(".tns")]
-
-# Iterate over each WAV file
-for transcript in transcript_files:
-    print("Summarizing: " + transcript)
-
-    # Open the WAV file
-    with open(transcript, "r") as file:
-        transcript_data = file.read()
-
-        # chunk the transcript so we don't blow out the context window
-        chunked_data = chunk_transcript(transcript_data, CHUNK_SIZE)
-
-        # Iterate through the chunks, and summarize them
-        for i, chunk in enumerate(chunked_data):
-            with open(summary_filename, "a") as md_file:
-                # Generate call summary
-                summary_prompt = SUMMARY_PROMPT.format(chunk=chunk)
-                summary = llama_api(summary_prompt)
-
-                # Generate fact summary
-                fact_prompt = FACT_PROMPT.format(chunk=chunk)
-                facts = llama_api(fact_prompt)
-
-                # Generate call sentiment
-                sentiment_prompt = SENTIMENT_PROMPT.format(chunk=chunk)
-                sentiment = llama_api(sentiment_prompt)
-
-                # Write the notes
-                md_file.write(f"# Call Transcript - {transcript} - Part {i + 1}\n\nSummary: {summary}\n\nFacts:\n{facts}\n\nSentiment: {sentiment}\n\n---\n")
-
-print("Summarizing complete")
+def summarize_transcripts():
+    """Summarize transcript files."""
+    today = datetime.datetime.now().strftime('%Y%m%d')
+    summary_filename = "summary-" + today + ".md"
+    transcript_files = [f for f in os.listdir(".") if f.endswith(".tns")]
+
+    for transcript in transcript_files:
+        print("Summarizing: " + transcript)
+        with open(transcript, "r") as file:
+            transcript_data = file.read()
+        chunked_data = chunk_transcript(transcript_data, CHUNK_SIZE)
+
+        with open(summary_filename, "a") as md_file:
+            for i, chunk in enumerate(chunked_data):
+                summary = llama_api(SUMMARY_PROMPT.format(chunk=chunk))
+                facts = llama_api(FACT_PROMPT.format(chunk=chunk))
+                sentiment = llama_api(SENTIMENT_PROMPT.format(chunk=chunk))

+                md_file.write(f"# Call Transcript - {transcript} - Part {i + 1}\n\nSummary: {summary}\n\nFacts:\n{facts}\n\nSentiment: {sentiment}\n\n---\n")
+
+    print("Summarizing complete")
+
+if __name__ == "__main__":
+    process_wav_files()
+    summarize_transcripts()
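One behavioral detail carries over unchanged from the old version: chunk_transcript splits on "\n" and concatenates lines without re-inserting the newlines, so lines run together at chunk boundaries. A quick standalone check, with the function body copied from the new file and a made-up sample transcript:

def chunk_transcript(string, chunk_size):
    """Chunk the transcript to fit in the context window."""
    chunks = []
    lines = string.split("\n")
    current_chunk = ""
    for line in lines:
        current_chunk += line  # note: the stripped "\n" is not restored
        if len(current_chunk) >= chunk_size:
            chunks.append(current_chunk)
            current_chunk = ""
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

sample = "hello there\nhow are you\nfine thanks\n"  # made-up sample transcript
print(chunk_transcript(sample, 12))
# -> ['hello therehow are you', 'fine thanks']

With the new __main__ guard, running summarize.py in a directory of .wav files performs both passes in order: transcription to .tns files, then chunked summarization into summary-YYYYMMDD.md.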