#!/usr/local/bin/perl -w
#
# Author: Adam Janin
# adam@janin.org
# Copyright (C) 2003 International Computer Science Institute
#
# For more information on the ICSI Meeting Corpus, see:
# http://www.icsi.berkeley.edu/Speech/mr
#
# This script is provided AS IS. It has not been extensively tested,
# and we offer no support or warranty.
#
# Convert from MRT format to an STM-like list format.
# See usage() at the bottom of this file for details.
#
# Requires XML::Parser.
#
# RCS $Header: /n/www/export/htdocs/speech/mr/tools/RCS/mrt2list,v 1.2 2004/04/29 20:16:47 janin Exp $
use strict;
use FileHandle;
use XML::Parser;
# Globals.  These are file-scoped lexicals shared between the main
# program and the XML::Parser handlers defined below.
my %Channel;          # Map of SpeakerID => default channel.
my $InSegment = 0;    # If 1, currently parsing a <Segment> ... </Segment>.
my $TextSoFar = '';   # Stored text transcript of particular segment.
my $Header;           # Stored meta information of particular segment.
my $Mid;              # Meeting ID according to MRT file.

# File given on command line.  Test for definedness and length rather
# than truth, so that a file literally named "0" is not mistaken for a
# missing argument.
my $FileName = shift;
usage() unless defined($FileName) && length($FileName);

# The XML::Parser instance.  Direct method-call syntax is used instead of
# the ambiguous indirect-object form "new XML::Parser(...)".
my $Parser = XML::Parser->new(Handlers => { Start => \&StartTag,
                                            End   => \&EndTag,
                                            Char  => \&Text });

if ($FileName eq '-') {
    # "mrt2list -" is documented to read from stdin.  Handle it here
    # explicitly instead of depending on how parsefile() opens its
    # argument.
    binmode(STDIN);
    $Parser->parse(\*STDIN);
} else {
    $Parser->parsefile($FileName);
}
# StartTag($expat, $elem, %attr)
#
# XML::Parser Start handler, called for every opening tag.
#   $expat - parser object handed in by XML::Parser; used here only to
#            recover the original text of tags embedded in a segment
#   $elem  - name of the element being opened
#   %attr  - the element's attributes
#
# <Participant> records the speaker's default channel in %Channel,
# <Meeting> records the meeting ID in $Mid, and <Segment> builds $Header
# and starts collecting text in $TextSoFar.  Any other tag seen while
# inside a segment is appended verbatim to $TextSoFar.
sub StartTag {
    my ($expat, $elem, %attr) = @_;

    if ($elem eq "Participant") {
        # Skip participants without a Name: there is nothing to key the
        # default channel on, and an undef hash key only triggers a warning.
        $Channel{$attr{Name}} = $attr{Channel} if defined($attr{Name});
    } elsif ($elem eq "Meeting") {
        $Mid = $attr{Session};
    } elsif ($elem eq "Segment") {
        my $chan;

        # Figure out which channel to use.
        if (exists($attr{Channel})) {
            # If the Segment has a Channel attribute, just use it.
            $chan = $attr{Channel};
        } elsif (exists($attr{Participant}) &&
                 (!exists($attr{CloseMic}) || $attr{CloseMic} eq "true")) {
            # No Channel attribute, but there is a Participant and CloseMic
            # is either absent or "true": use the participant's default
            # channel.  If no default channel is known for that participant,
            # fall back to "far" so the header never contains an undefined
            # (empty) channel column.
            $chan = $Channel{$attr{Participant}};
            $chan = 'far' unless defined($chan);
        } else {
            # Otherwise, just use "far".
            $chan = 'far';
        }

        # Get the Participant attribute, or "none" if none provided.
        my $sid = exists($attr{Participant}) ? $attr{Participant} : "none";

        # Store the header so we can print it out when we hit the
        # closing </Segment> tag.
        $Header = "$Mid $sid $chan $attr{StartTime} $attr{EndTime}";

        # Start collecting up text.
        $InSegment = 1;
        $TextSoFar = '';
    } elsif ($InSegment) {
        # Any other tag is passed verbatim if we're in a <Segment>.
        $TextSoFar .= " " . $expat->original_string();
    }
}
# EndTag($expat, $elem)
#
# XML::Parser End handler.  A closing </Segment> flushes the collected
# segment: its whitespace is tidied and, if anything other than
# whitespace is left, the line "$Header $TextSoFar" is printed.  Any
# other closing tag seen inside a segment is appended verbatim to the
# collected text.
sub EndTag {
    my ($parser, $tag) = @_;

    if ($tag ne "Segment") {
        # Embedded markup is kept as-is, separated by a single space.
        $TextSoFar .= " " . $parser->original_string() if $InSegment;
        return;
    }

    # Clean everything up and print it out.
    for ($TextSoFar) {
        tr/\n/ /;         # newlines become spaces
        s/\s{2,}/ /g;     # runs of two or more whitespace chars collapse
        s/^\s+//;         # trim leading whitespace
        s/\s+$//;         # trim trailing whitespace
    }
    print "$Header $TextSoFar\n" if $TextSoFar =~ /\S/;

    $InSegment = 0;
    $TextSoFar = '';
}
# Text($expat, $string)
#
# XML::Parser Char handler.  Character data is appended to $TextSoFar
# while inside a segment and ignored otherwise.  The parser object in
# the first argument is not used.
sub Text {
    my (undef, $chars) = @_;
    $TextSoFar .= $chars if $InSegment;
}
# usage()
#
# Print the help text and terminate the script.  The only caller is the
# missing-argument path of the main program, so the text goes to STDERR
# (keeping STDOUT clean for pipelines) and the exit status is non-zero so
# that calling scripts can detect the failure.  Does not return.
sub usage {
    print STDERR <<"EndOfUsage";
Usage: mrt2list file.mrt
Convert from MRT format to an STM-like list format. Much of the meta
data is dropped in this conversion, but it is easier to sort and grep
than the MRT format.
You may use \"mrt2list -\" to read from stdin instead of a file.
Output is one line per segment. Segments have the following format:
mid sid chan start end word1 word2 ...
mid - Meeting ID (e.g. Bmr023)
sid - Speaker ID (e.g. me011) or \"none\" if no speaker specified
chan - Channel (e.g. chan0) or \"far\" if not associated with a near field mic
start - Start time in seconds
end - End time in seconds
word1 word2 ... - The words exactly as they appear in the MRT file
Examples from \"mrt2list Bsr001.mrt\":
Bsr001 fe016 chan9 0.010 0.996 I was just thinking that
Bsr001 me055 chan1 0.320 2.740
Bsr001 none far 10.832 21.800 @@
Bsr001 mn057 far 26.438 30.234 If you need it
Typically, you will want to either remove the embedded tags in the
segments or convert them from XML to the textual transcriber\'s
conventions using the mrt_tag tool.
EndOfUsage
    exit(1);
}