816 lines
27 KiB
XML
816 lines
27 KiB
XML
|
<?xml version='1.0'?>
|
||
|
<!DOCTYPE rfc SYSTEM 'rfc2629.dtd'>
|
||
|
<?rfc toc="yes" ?>
|
||
|
|
||
|
<rfc ipr="full3667" docName="draft-herlein-speex-rtp-profile-03">
|
||
|
|
||
|
<front>
|
||
|
<title>RTP Payload Format for the Speex Codec</title>
|
||
|
|
||
|
<author initials="G" surname="Herlein" fullname="Greg Herlein">
|
||
|
<organization></organization>
|
||
|
<address>
|
||
|
<email>gherlein@herlein.com</email>
|
||
|
<postal>
|
||
|
<street>2034 Filbert Street</street>
|
||
|
<city>San Francisco</city>
|
||
|
<region>California</region>
|
||
|
<code>94123</code>
|
||
|
<country>United States</country>
|
||
|
</postal>
|
||
|
</address>
|
||
|
</author>
|
||
|
|
||
|
<author initials="S" surname="Morlat" fullname="Simon Morlat">
|
||
|
<address>
|
||
|
<email>simon.morlat@linphone.org</email>
|
||
|
<postal>
|
||
|
<street>35, av de Vizille App 42</street>
|
||
|
<city>Grenoble</city>
|
||
|
<code>38000</code>
|
||
|
<country>France</country>
|
||
|
</postal>
|
||
|
</address>
|
||
|
</author>
|
||
|
|
||
|
<author initials="J" surname="Valin" fullname="Jean-Marc Valin">
|
||
|
<address>
|
||
|
<email>jean-marc.valin@hermes.usherb.ca</email>
|
||
|
<postal>
|
||
|
<street>Department of Electrical and Computer Engineering</street>
|
||
|
<street>University of Sherbrooke</street>
|
||
|
<street>2500 blvd Universite</street>
|
||
|
<city>Sherbrooke</city>
|
||
|
<region>Quebec</region>
|
||
|
<code>J1K 2R1</code>
|
||
|
<country>Canada</country>
|
||
|
</postal>
|
||
|
</address>
|
||
|
</author>
|
||
|
|
||
|
<author initials="R" surname="Hardiman" fullname="Roger Hardiman">
|
||
|
<address>
|
||
|
<email>roger@freebsd.org</email>
|
||
|
<postal>
|
||
|
<street>49 Nettleton Road</street>
|
||
|
<city>Cheltenham</city>
|
||
|
<region>Gloucestershire</region>
|
||
|
<code>GL51 6NR</code>
|
||
|
<country>England</country>
|
||
|
</postal>
|
||
|
</address>
|
||
|
</author>
|
||
|
|
||
|
|
||
|
<author initials="P" surname="Kerr" fullname="Phil Kerr">
|
||
|
<address>
|
||
|
<email>phil@plus24.com</email>
|
||
|
<postal>
|
||
|
<country>England</country>
|
||
|
</postal>
|
||
|
</address>
|
||
|
</author>
|
||
|
|
||
|
<date day="01" month="January" year="2005" />
|
||
|
|
||
|
<area>General</area>
|
||
|
<workgroup>AVT Working Group</workgroup>
|
||
|
<keyword>I-D</keyword>
|
||
|
|
||
|
<keyword>Internet-Draft</keyword>
|
||
|
<keyword>Speex</keyword>
|
||
|
<keyword>RTP</keyword>
|
||
|
<abstract>
|
||
|
<t>
|
||
|
Speex is an open-source voice codec suitable for use in Voice over
|
||
|
IP (VoIP) type applications. This document describes the payload
|
||
|
format for Speex generated bit streams within an RTP packet. Also
|
||
|
included here are the necessary details for the use of Speex with
|
||
|
the Session Description Protocol (SDP) and a preliminary method of
|
||
|
using Speex within H.323 applications.
|
||
|
</t>
|
||
|
</abstract>
|
||
|
</front>
|
||
|
|
||
|
<middle>
|
||
|
|
||
|
<section anchor="Conventions used in this document" title="Conventions used in this document">
|
||
|
<t>
|
||
|
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
|
||
|
"SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
|
||
|
document are to be interpreted as described in RFC 2119 <xref target="rfc2119"></xref>.
|
||
|
</t>
|
||
|
</section>
|
||
|
|
||
|
<section anchor="Overview of the Speex Codec" title="Overview of the Speex Codec">
|
||
|
|
||
|
<t>
|
||
|
Speex is based on the CELP <xref target="CELP"></xref> encoding technique with support for
|
||
|
either narrowband (nominal 8kHz), wideband (nominal 16kHz) or
|
||
|
ultra-wideband (nominal 32kHz), with (non-optimal) rates up to 48 kHz
|
||
|
sampling also available. The main characteristics can be summarized
|
||
|
as follows:
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
<list style="symbols">
|
||
|
<t>Free software/open-source</t>
|
||
|
<t>Integration of wideband and narrowband in the same bit-stream</t>
|
||
|
<t>Wide range of bit-rates available</t>
|
||
|
<t>Dynamic bit-rate switching and variable bit-rate (VBR)</t>
|
||
|
<t>Voice Activity Detection (VAD, integrated with VBR)</t>
|
||
|
<t>Variable complexity</t>
|
||
|
</list>
|
||
|
</t>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section anchor="RTP payload format for Speex" title="RTP payload format for Speex">
|
||
|
|
||
|
<t>
|
||
|
For RTP based transportation of Speex encoded audio the standard
|
||
|
RTP header <xref target="rfc3550"></xref> is followed by one or more payload data blocks.
|
||
|
An optional padding terminator may also be used.
|
||
|
</t>
|
||
|
<artwork><![CDATA[
|
||
|
0 1 2 3
|
||
|
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| RTP Header |
|
||
|
+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
|
||
|
| one or more frames of Speex .... |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| one or more frames of Speex .... | padding |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
]]></artwork>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section anchor="RTP Header" title="RTP Header">
|
||
|
|
||
|
<artwork><![CDATA[
|
||
|
0 1 2 3
|
||
|
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
|V=2|P|X| CC |M| PT | sequence number |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| timestamp |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| synchronization source (SSRC) identifier |
|
||
|
+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
|
||
|
| contributing source (CSRC) identifiers |
|
||
|
| ... |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
]]></artwork>
|
||
|
|
||
|
<t>
|
||
|
The RTP header begins with an octet of fields (V, P, X, and CC) to
|
||
|
support specialized RTP uses (see <xref target="rfc3550"></xref> and <xref target="rfc3551"></xref> for details). For
|
||
|
Speex the following values are used.
|
||
|
</t>
|
||
|
|
||
|
<t>Version (V): 2 bits</t><t>
|
||
|
This field identifies the version of RTP. The version
|
||
|
used by this specification is two <xref target="rfc3550"></xref>.
|
||
|
</t>
|
||
|
|
||
|
<t>Padding (P): 1 bit</t><t>
|
||
|
If the padding bit is set, the packet contains one or more
|
||
|
additional padding octets at the end which are not part of
|
||
|
the payload. P is set if the total packet size is less than
|
||
|
the MTU.
|
||
|
</t>
|
||
|
|
||
|
<t>Extension (X): 1 bit</t><t>
|
||
|
If the extension (X) bit is set, the fixed header MUST be
|
||
|
followed by exactly one header extension, with a format defined
|
||
|
in Section 5.3.1. of <xref target="rfc3550"></xref>.
|
||
|
</t>
|
||
|
|
||
|
<t>CSRC count (CC): 4 bits</t><t>
|
||
|
The CSRC count contains the number of CSRC identifiers.
|
||
|
</t>
|
||
|
|
||
|
<t>Marker (M): 1 bit</t><t>
|
||
|
The M bit indicates if the packet contains comfort noise. This
|
||
|
field is used in conjunction with the cng SDP attribute and is
|
||
|
detailed further in section 5 below. In normal usage this bit
|
||
|
is set if the packet contains comfort noise.
|
||
|
</t>
|
||
|
|
||
|
<t>Payload Type (PT): 7 bits</t><t>
|
||
|
An RTP profile for a class of applications is expected to assign
|
||
|
a payload type for this format, or a dynamically allocated
|
||
|
payload type SHOULD be chosen which designates the payload as
|
||
|
Speex.
|
||
|
</t>
|
||
|
|
||
|
<t>Sequence number: 16 bits</t><t>
|
||
|
The sequence number increments by one for each RTP data packet
|
||
|
sent, and may be used by the receiver to detect packet loss and
|
||
|
to restore packet sequence. This field is detailed further in
|
||
|
<xref target="rfc3550"></xref>.
|
||
|
</t>
|
||
|
|
||
|
<t>Timestamp: 32 bits</t><t>
|
||
|
A timestamp representing the sampling time of the first sample of
|
||
|
the first Speex packet in the RTP packet. The clock frequency
|
||
|
MUST be set to the sample rate of the encoded audio data.
|
||
|
|
||
|
Speex uses 20 msec frames and a variable sampling rate clock.
|
||
|
The RTP timestamp MUST be in units of 1/X of a second where X
|
||
|
is the sample rate used. Speex uses a nominal 8kHz sampling rate
|
||
|
for narrowband use, a nominal 16kHz sampling rate for wideband use,
|
||
|
and a nominal 32kHz sampling rate for ultra-wideband use.
|
||
|
</t>
|
||
|
|
||
|
<t>SSRC/CSRC identifiers:</t><t>
|
||
|
These two fields, 32 bits each with one SSRC field and a maximum
|
||
|
of 16 CSRC fields, are as defined in <xref target="rfc3550"></xref>.
|
||
|
</t>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section anchor="Speex payload" title="Speex payload">
|
||
|
|
||
|
<t>
|
||
|
For the purposes of packetizing the bit stream in RTP, it is only
|
||
|
necessary to consider the sequence of bits as output by the Speex
|
||
|
encoder <xref target="speexenc"></xref>, and present the same sequence to the decoder. The
|
||
|
payload format described here maintains this sequence.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
A typical Speex frame, encoded at the maximum bitrate, is approx.
|
||
|
110 octets and the total number of Speex frames SHOULD be kept
|
||
|
less than the path MTU to prevent fragmentation. Speex frames MUST
|
||
|
NOT be fragmented across multiple RTP packets.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
An RTP packet MAY contain Speex frames of the same bit rate or of
|
||
|
varying bit rates, since the bit-rate for a frame is conveyed in
|
||
|
band with the signal.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
The encoding and decoding algorithm can change the bit rate at any
|
||
|
20 msec frame boundary, with the bit rate change notification provided
|
||
|
in-band with the bit stream. Each frame contains both "mode"
|
||
|
(narrowband, wideband or ultra-wideband) and "sub-mode" (bit-rate)
|
||
|
information in the bit stream. No out-of-band notification is
|
||
|
required for the decoder to process changes in the bit rate sent
|
||
|
by the encoder.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
It is RECOMMENDED that values of 8000, 16000 and 32000 be used
|
||
|
for normal internet telephony applications, though the sample
|
||
|
rate is supported at rates as low as 6000 Hz and as high as
|
||
|
48 kHz.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
The RTP payload MUST be padded to provide an integer number of
|
||
|
octets as the payload length. These padding bits are LSB aligned
|
||
|
in network octet order and consist of a 0 followed by all ones
|
||
|
(until the end of the octet). This padding is only required for
|
||
|
the last frame in the packet, and only to ensure the packet
|
||
|
contents ends on an octet boundary.
|
||
|
</t>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section anchor="Example Speex packet" title="Example Speex packet">
|
||
|
|
||
|
<t>
|
||
|
In the example below we have a single Speex frame with 5 bits
|
||
|
of padding to ensure the packet size falls on an octet boundary.
|
||
|
</t>
|
||
|
|
||
|
<artwork><![CDATA[
|
||
|
0 1 2 3
|
||
|
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
|V=2|P|X| CC |M| PT | sequence number |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| timestamp |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| synchronization source (SSRC) identifier |
|
||
|
+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
|
||
|
|
||
|
0 1 2 3
|
||
|
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
||
|
+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
|
||
|
| contributing source (CSRC) identifiers |
|
||
|
| ... |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| ..speex data.. |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| ..speex data.. |0 1 1 1 1|
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
]]></artwork>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section anchor="Multiple Speex frames in a RTP packet" title="Multiple Speex frames in a RTP packet">
|
||
|
|
||
|
<t>
|
||
|
Below is an example of two Speex frames contained within one RTP
|
||
|
packet. The Speex frame length in this example falls on an octet
|
||
|
boundary so there is no padding.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
Speex codecs <xref target="speexenc"></xref> are able to detect the bitrate from the
|
||
|
payload and are responsible for detecting the 20 msec boundaries
|
||
|
between each frame.
|
||
|
</t>
|
||
|
|
||
|
<artwork><![CDATA[
|
||
|
0 1 2 3
|
||
|
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
|V=2|P|X| CC |M| PT | sequence number |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| timestamp |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| synchronization source (SSRC) identifier |
|
||
|
+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
|
||
|
| contributing source (CSRC) identifiers |
|
||
|
| ... |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| ..speex data.. |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| ..speex data.. | ..speex data.. |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
| ..speex data.. |
|
||
|
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||
|
]]></artwork>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section anchor="MIME registration of Speex" title="MIME registration of Speex">
|
||
|
|
||
|
<t>
|
||
|
Full definition of the MIME <xref target="rfc2045"></xref> type for Speex will be part of the Ogg
|
||
|
Vorbis MIME type definition application <xref target="rfc3534"></xref>.
|
||
|
</t>
|
||
|
|
||
|
<t>MIME media type name: audio</t>
|
||
|
|
||
|
<t>MIME subtype: speex</t>
|
||
|
|
||
|
<t>Optional parameters:</t>
|
||
|
|
||
|
<t>Required parameters: to be included in the Ogg MIME specification.</t>
|
||
|
|
||
|
<t>Encoding considerations:</t>
|
||
|
|
||
|
<t>Security Considerations:</t>
|
||
|
<t>See Section 6 of RFC 3047.</t>
|
||
|
|
||
|
<t>Interoperability considerations: none</t>
|
||
|
|
||
|
<t>Published specification: </t>
|
||
|
|
||
|
<t>Applications which use this media type:</t>
|
||
|
|
||
|
<t>Additional information: none</t>
|
||
|
|
||
|
<t>Person &amp; email address to contact for further information:<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>Greg Herlein &lt;gherlein@herlein.com&gt;</t>
|
||
|
<t>Jean-Marc Valin &lt;jean-marc.valin@hermes.usherb.ca&gt;</t>
|
||
|
</list>
|
||
|
</t>
|
||
|
|
||
|
<t>Intended usage: COMMON</t>
|
||
|
|
||
|
<t>Author/Change controller:</t>
|
||
|
|
||
|
<t>
|
||
|
<list style="empty">
|
||
|
<t>Author: Greg Herlein &lt;gherlein@herlein.com&gt;</t>
|
||
|
<t>Change controller: Greg Herlein &lt;gherlein@herlein.com&gt;</t>
|
||
|
<t>Change controller: IETF AVT Working Group</t>
|
||
|
</list>
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
This transport type signifies that the content is to be interpreted
|
||
|
according to this document if the contents are transmitted over RTP.
|
||
|
Should this transport type appear over a lossless streaming protocol
|
||
|
such as TCP, the content encapsulation should be interpreted as an
|
||
|
Ogg Stream in accordance with <xref target="rfc3534"></xref>, with the exception that the
|
||
|
content of the Ogg Stream may be assumed to be Speex audio and
|
||
|
Speex audio only.
|
||
|
</t>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section anchor="SDP usage of Speex" title="SDP usage of Speex">
|
||
|
|
||
|
<t>
|
||
|
When conveying information by SDP <xref target="rfc2327"></xref>, the encoding name MUST be
|
||
|
set to "speex". An example of the media representation in SDP for
|
||
|
offering a single channel of Speex at 8000 samples per second might
|
||
|
be:
|
||
|
</t>
|
||
|
|
||
|
<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>m=audio 8088 RTP/AVP 97</t>
|
||
|
<t>a=rtpmap:97 speex/8000</t>
|
||
|
</list>
|
||
|
|
||
|
<t>
|
||
|
Note that the RTP payload type code of 97 is defined in this media
|
||
|
definition to be 'mapped' to the speex codec at an 8kHz sampling
|
||
|
frequency using the 'a=rtpmap' line. Any number from 96 to 127
|
||
|
could have been chosen (the allowed range for dynamic types).
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
The value of the sampling frequency is typically 8000 for narrow band
|
||
|
operation, 16000 for wide band operation, and 32000 for ultra-wide
|
||
|
band operation.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
If for some reason the offerer has bandwidth limitations, the client
|
||
|
may use the "b=" header, as explained in SDP <xref target="rfc2327"></xref>. The following example
|
||
|
illustrates the case where the offerer cannot receive more than
|
||
|
10 kbit/s.
|
||
|
</t>
|
||
|
|
||
|
<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>m=audio 8088 RTP/AVP 97</t>
|
||
|
<t>b=AS:10</t>
|
||
|
<t>a=rtpmap:97 speex/8000</t>
|
||
|
</list>
|
||
|
|
||
|
<t>
|
||
|
In this case, if the remote party agrees, it should configure its
|
||
|
Speex encoder so that it does not use modes that produce more than
|
||
|
10 kbit/s. Note that the "b=" constraint also applies to all
|
||
|
payload types that may be proposed in the media line ("m=").
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
Another way to make recommendations to the remote Speex encoder
|
||
|
is to use its specific parameters via the a=fmtp: directive. The
|
||
|
following parameters are defined for use in this way:
|
||
|
</t>
|
||
|
|
||
|
<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>ptime: duration of each packet in milliseconds.<vspace blankLines="1" /></t>
|
||
|
|
||
|
<t>sr: actual sample rate in Hz.<vspace blankLines="1" /></t>
|
||
|
|
||
|
<t>ebw: encoding bandwidth - either 'narrow' or 'wide' or
|
||
|
'ultra' (corresponds to nominal 8000, 16000, and
|
||
|
32000 Hz sampling rates).<vspace blankLines="1" /></t>
|
||
|
|
||
|
<t>vbr: variable bit rate - either 'on' 'off' or 'vad'
|
||
|
(defaults to off). If on, variable bit rate is
|
||
|
enabled. If off, disabled. If set to 'vad' then
|
||
|
constant bit rate is used but silence will be encoded
|
||
|
with special short frames to indicate a lack of voice
|
||
|
for that period.<vspace blankLines="1" /></t>
|
||
|
|
||
|
<t>cng: comfort noise generation - either 'on' or 'off'. If
|
||
|
off then silence frames will be silent; if 'on' then
|
||
|
those frames will be filled with comfort noise.<vspace blankLines="1" /></t>
|
||
|
|
||
|
<t>mode: Speex encoding mode. Can be {1,2,3,4,5,6,any}
|
||
|
defaults to 3 in narrowband, 6 in wide and ultra-wide.<vspace blankLines="1" /></t>
|
||
|
|
||
|
<t>penh: use of perceptual enhancement. 1 indicates
|
||
|
to the decoder that perceptual enhancement is recommended,
|
||
|
0 indicates that it is not. Defaults to on (1).<vspace blankLines="1" /></t>
|
||
|
</list>
|
||
|
|
||
|
<t>Examples:</t>
|
||
|
|
||
|
<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>m=audio 8008 RTP/AVP 97</t>
|
||
|
<t>a=rtpmap:97 speex/8000</t>
|
||
|
<t>a=fmtp:97 mode=4</t>
|
||
|
</list>
|
||
|
|
||
|
<t>
|
||
|
This example illustrates an offerer that wishes to receive
|
||
|
a Speex stream at 8000Hz, but only using speex mode 4.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
The offerer may suggest to the remote decoder to activate
|
||
|
its perceptual enhancement filter like this:
|
||
|
</t>
|
||
|
|
||
|
<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>m=audio 8088 RTP/AVP 97</t>
|
||
|
<t>a=rtpmap:97 speex/8000</t>
|
||
|
<t>a=fmtp:97 penh=1 </t>
|
||
|
</list>
|
||
|
|
||
|
<t>
|
||
|
Several Speex specific parameters can be given in a single
|
||
|
a=fmtp line provided that they are separated by a semi-colon:
|
||
|
</t>
|
||
|
|
||
|
<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>a=fmtp:97 mode=any;penh=1</t>
|
||
|
</list>
|
||
|
|
||
|
<t>
|
||
|
The offerer may indicate that it wishes to send variable bit rate
|
||
|
frames with comfort noise:
|
||
|
</t>
|
||
|
|
||
|
<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>m=audio 8088 RTP/AVP 97</t>
|
||
|
<t>a=rtpmap:97 speex/8000</t>
|
||
|
<t>a=fmtp:97 vbr=on;cng=on</t>
|
||
|
</list>
|
||
|
|
||
|
<t>
|
||
|
The "ptime" attribute is used to denote the packetization
|
||
|
interval (i.e., how many milliseconds of audio is encoded in a
|
||
|
single RTP packet). Since Speex uses 20 msec frames, ptime values
|
||
|
of multiples of 20 denote multiple Speex frames per packet.
|
||
|
Values of ptime which are not multiples of 20 MUST be ignored
|
||
|
and clients MUST use the default value of 20 instead.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
In the example below the ptime value is set to 40, indicating that
|
||
|
there are 2 frames in each packet.
|
||
|
</t>
|
||
|
|
||
|
<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>m=audio 8008 RTP/AVP 97</t>
|
||
|
<t>a=rtpmap:97 speex/8000</t>
|
||
|
<t>a=ptime:40</t>
|
||
|
</list>
|
||
|
|
||
|
<t>
|
||
|
Note that the ptime parameter applies to all payloads listed
|
||
|
in the media line and is not used as part of an a=fmtp directive.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
Values of ptime not multiple of 20 msec are meaningless, so the
|
||
|
receiver of such ptime values MUST ignore them. If during the
|
||
|
life of an RTP session the ptime value changes, when there are
|
||
|
multiple Speex frames for example, the SDP value must also reflect
|
||
|
the new value.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
Care must be taken when setting the value of ptime so that the
|
||
|
RTP packet size does not exceed the path MTU.
|
||
|
</t>
|
||
|
|
||
|
</section>
|
||
|
<section anchor="ITU H.323/H.245 Use of Speex" title="ITU H.323/H.245 Use of Speex">
|
||
|
|
||
|
<t>
|
||
|
Application is underway to make Speex a standard ITU codec.
|
||
|
However, until that is finalized, Speex MAY be used in H.323 <xref target="H323"></xref> by
|
||
|
using a non-standard codec block definition in the H.245 <xref target="H245"></xref> codec
|
||
|
capability negotiations.
|
||
|
</t>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section anchor="NonStandardMessage format" title="NonStandardMessage format">
|
||
|
|
||
|
<t>
|
||
|
For Speex use in H.245 <xref target="H245"></xref> based systems, the fields in the
|
||
|
NonStandardMessage should be:
|
||
|
</t>
|
||
|
|
||
|
<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>t35CountryCode = Hex: B5</t>
|
||
|
<t>t35Extension = Hex: 00</t>
|
||
|
<t>manufacturerCode = Hex: 0026</t>
|
||
|
<t>[Length of the Binary Sequence (8 bit number)]</t>
|
||
|
<t>[Binary Sequence consisting of an ASCII string, no NULL terminator]</t>
|
||
|
</list>
|
||
|
|
||
|
<t>
|
||
|
The binary sequence is an ASCII string merely for ease of use.
|
||
|
The string is not null terminated. The format of this string is:
|
||
|
</t>
|
||
|
|
||
|
<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>speex [optional variables]</t>
|
||
|
</list>
|
||
|
|
||
|
<t>
|
||
|
The optional variables are identical to those used for the SDP
|
||
|
a=fmtp strings discussed in section 5 above. The string is built
|
||
|
to be all on one line, each key-value pair separated by a
|
||
|
semi-colon. The optional variables MAY be omitted, which causes
|
||
|
the default values to be assumed. They are:
|
||
|
</t>
|
||
|
|
||
|
<vspace blankLines="1" />
|
||
|
<list style="empty">
|
||
|
<t>ebw=narrow;mode=3;vbr=off;cng=off;ptime=20;sr=8000;penh=no;</t>
|
||
|
</list>
|
||
|
|
||
|
<t>
|
||
|
The fifth octet of the block is the length of the binary sequence.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
NOTE: this method can result in the advertising of a large number
|
||
|
of Speex 'codecs' based on the number of variables possible. For
|
||
|
most VoIP applications, use of the default binary sequence of
|
||
|
'speex' is RECOMMENDED to be used in addition to all other options.
|
||
|
This maximizes the chances that two H.323 based applications that
|
||
|
support Speex can find a mutual codec.
|
||
|
</t>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section anchor="RTP Payload Types" title="RTP Payload Types">
|
||
|
|
||
|
<t>
|
||
|
Dynamic payload type codes MUST be negotiated 'out-of-band'
|
||
|
for the assignment of a dynamic payload type from the
|
||
|
range of 96-127. H.323 applications MUST use the H.245
|
||
|
H2250LogicalChannelParameters encoding to accomplish this.
|
||
|
</t>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section anchor="Security Considerations" title="Security Considerations">
|
||
|
|
||
|
<t>
|
||
|
RTP packets using the payload format defined in this specification
|
||
|
are subject to the security considerations discussed in the RTP
|
||
|
specification <xref target="rfc3550"></xref>, and any appropriate RTP profile. This implies
|
||
|
that confidentiality of the media streams is achieved by encryption.
|
||
|
Because the data compression used with this payload format is applied
|
||
|
end-to-end, encryption may be performed after compression so there is
|
||
|
no conflict between the two operations.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
A potential denial-of-service threat exists for data encodings using
|
||
|
compression techniques that have non-uniform receiver-end
|
||
|
computational load. The attacker can inject pathological datagrams
|
||
|
into the stream which are complex to decode and cause the receiver to
|
||
|
be overloaded. However, this encoding does not exhibit any
|
||
|
significant non-uniformity.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
As with any IP-based protocol, in some circumstances a receiver may
|
||
|
be overloaded simply by the receipt of too many packets, either
|
||
|
desired or undesired. Network-layer authentication may be used to
|
||
|
discard packets from undesired sources, but the processing cost of
|
||
|
the authentication itself may be too high.
|
||
|
</t>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section anchor="Acknowledgments" title="Acknowledgments">
|
||
|
|
||
|
<t>
|
||
|
The authors would like to thank Equivalence Pty Ltd of Australia
|
||
|
for their assistance in attempting to standardize the use of Speex
|
||
|
in H.323 applications, and for implementing Speex in their open
|
||
|
source OpenH323 stack. The authors would also like to thank Brian
|
||
|
C. Wiles &lt;brian@streamcomm.com&gt; of StreamComm for his assistance in
|
||
|
developing the proposed standard for Speex use in H.323
|
||
|
applications.
|
||
|
</t>
|
||
|
|
||
|
<t>
|
||
|
The authors would also like to thank the following members of the
|
||
|
Speex and AVT communities for their input: Ross Finlayson,
|
||
|
Federico Montesino Pouzols, Henning Schulzrinne, Magnus Westerlund.
|
||
|
</t>
|
||
|
</section>
|
||
|
|
||
|
</middle>
|
||
|
|
||
|
<back>
|
||
|
|
||
|
<references title="Normative References">
|
||
|
|
||
|
<reference anchor="rfc2119">
|
||
|
<front>
|
||
|
<title>Key words for use in RFCs to Indicate Requirement Levels </title>
|
||
|
<author initials="S." surname="Bradner" fullname="Scott Bradner"></author>
|
||
|
</front>
|
||
|
<seriesInfo name="RFC" value="2119" />
|
||
|
</reference>
|
||
|
|
||
|
<reference anchor="rfc3550">
|
||
|
<front>
|
||
|
<title>RTP: A Transport Protocol for real-time applications</title>
|
||
|
<author initials="H." surname="Schulzrinne" fullname=""></author>
|
||
|
<author initials="S." surname="Casner" fullname=""></author>
|
||
|
<author initials="R." surname="Frederick" fullname=""></author>
|
||
|
<author initials="V." surname="Jacobson" fullname=""></author>
|
||
|
</front>
|
||
|
<seriesInfo name="RFC" value="3550" />
|
||
|
</reference>
|
||
|
|
||
|
<reference anchor="rfc2045">
|
||
|
<front>
|
||
|
<title>Multipurpose Internet Mail Extensions (MIME) Part One: Format of Internet Message Bodies</title>
|
||
|
<author initials="" surname="" fullname=""></author>
|
||
|
</front>
|
||
|
<date month="November" year="1996" />
|
||
|
<seriesInfo name="RFC" value="2045" />
|
||
|
</reference>
|
||
|
|
||
|
<reference anchor="rfc2327">
|
||
|
<front>
|
||
|
<title>SDP: Session Description Protocol</title>
|
||
|
<author initials="V." surname="Jacobson" fullname=""></author>
|
||
|
<author initials="M." surname="Handley" fullname=""></author>
|
||
|
</front>
|
||
|
<date month="April" year="1998" />
|
||
|
<seriesInfo name="RFC" value="2327" />
|
||
|
</reference>
|
||
|
|
||
|
<reference anchor="H323">
|
||
|
<front>
|
||
|
<title>Packet-based Multimedia Communications Systems</title>
|
||
|
<author initials="" surname="" fullname=""></author>
|
||
|
</front>
|
||
|
<date month="" year="1998" />
|
||
|
<seriesInfo name="ITU-T Recommendation" value="H.323" />
|
||
|
</reference>
|
||
|
|
||
|
<reference anchor="H245">
|
||
|
<front>
|
||
|
<title>Control of communications between Visual Telephone Systems and Terminal Equipment</title>
|
||
|
<author initials="" surname="" fullname=""></author>
|
||
|
</front>
|
||
|
<date month="" year="1998" />
|
||
|
<seriesInfo name="ITU-T Recommendation" value="H.245" />
|
||
|
</reference>
|
||
|
|
||
|
<reference anchor="rfc3551">
|
||
|
<front>
|
||
|
<title>RTP Profile for Audio and Video Conferences with Minimal Control.</title>
|
||
|
<author initials="H." surname="Schulzrinne" fullname=""></author>
|
||
|
<author initials="S." surname="Casner" fullname=""></author>
|
||
|
</front>
|
||
|
<date month="July" year="2003" />
|
||
|
<seriesInfo name="RFC" value="3551" />
|
||
|
</reference>
|
||
|
|
||
|
<reference anchor="rfc3534">
|
||
|
<front>
|
||
|
<title>The application/ogg Media Type</title>
|
||
|
<author initials="L." surname="Walleij" fullname=""></author>
|
||
|
</front>
|
||
|
<date month="May" year="2003" />
|
||
|
<seriesInfo name="RFC" value="3534" />
|
||
|
</reference>
|
||
|
|
||
|
</references>
|
||
|
|
||
|
<references title="Informative References">
|
||
|
|
||
|
<reference anchor="speexenc">
|
||
|
<front>
|
||
|
<title>Speexenc/speexdec, reference command-line encoder/decoder</title>
|
||
|
</front>
|
||
|
<seriesInfo name="Speex website" value="http://www.speex.org/" />
|
||
|
</reference>
|
||
|
|
||
|
<reference anchor="CELP">
|
||
|
<front>
|
||
|
<title>CELP, U.S. Federal Standard 1016.</title>
|
||
|
<author initials="" surname="" fullname=""></author>
|
||
|
</front>
|
||
|
<seriesInfo name="National Technical Information Service (NTIS) website" value="http://www.ntis.gov/" />
|
||
|
</reference>
|
||
|
|
||
|
</references>
|
||
|
|
||
|
</back>
|
||
|
</rfc>
|