diff mbox series

[5/6,v2] sound: Add n64 driver

Message ID 20210108103513.336e6eb9ad323feff6758e20@gmx.com (mailing list archive)
State Not Applicable
Headers show
Series None | expand

Commit Message

Lauri Kasanen Jan. 8, 2021, 8:35 a.m. UTC
This adds support for the Nintendo 64 console's sound.

The sound DMA unit has errata on certain alignments, which is why
we can't use alsa's DMA buffer directly.

Signed-off-by: Lauri Kasanen <cand@gmx.com>
---

v2:
s/to_uncac/ckseg1/

 sound/mips/Kconfig   |   7 ++
 sound/mips/Makefile  |   1 +
 sound/mips/snd-n64.c | 297 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 305 insertions(+)
 create mode 100644 sound/mips/snd-n64.c

--
2.6.2

Comments

Takashi Iwai Jan. 8, 2021, 9:06 a.m. UTC | #1
Hi,

thanks for the patch.  Now I started reviewing.  Some comments below.

On Fri, 08 Jan 2021 09:35:13 +0100,
Lauri Kasanen wrote:
> 
> This adds support for the Nintendo 64 console's sound.
> 
> The sound DMA unit has errata on certain alignments, which is why
> we can't use alsa's DMA buffer directly.

It's worth to mention this in the source code, too, before later
readers wonder it again.

> diff --git a/sound/mips/snd-n64.c b/sound/mips/snd-n64.c
> new file mode 100644
> index 0000000..0317fe2
> --- /dev/null
> +++ b/sound/mips/snd-n64.c
> @@ -0,0 +1,297 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + *   Sound driver for Nintendo 64.
> + *
> + *   Copyright 2020 Lauri Kasanen

We moved forward, a happy new year :)

> +#define REG_BASE ((u32 *) CKSEG1ADDR(0x4500000))

Better to put __iomem?

> +#define MI_REG_BASE ((u32 *) CKSEG1ADDR(0x4300000))

Ditto.

> +struct n64audio_t {

We usually don't put _t suffix for a struct name.
It's only for typedefs.  So, just use struct n64audio.

> +static void n64audio_push(struct n64audio_t *priv, uint8_t irq)

This irq is a boolean flag, no?  Then use bool to be clearer.

> +static irqreturn_t n64audio_isr(int irq, void *dev_id)
> +{
> +	struct n64audio_t *priv = dev_id;
> +
> +	// Check it's ours
> +	const u32 intrs = n64mi_read_reg(MI_INTR_REG);

checkpatch would complain the blank line above (and the lack of it in
below).

> +	if (!(intrs & MI_INTR_AI))
> +		return IRQ_NONE;
> +
> +	n64audio_write_reg(AI_STATUS_REG, 1);
> +
> +	n64audio_push(priv, 1);
> +	snd_pcm_period_elapsed(priv->chan.substream);

It might be safer to check whether the stream is really present and
running.

> +
> +	return IRQ_HANDLED;
> +}
> +
> +static const struct snd_pcm_hardware n64audio_pcm_hw = {
> +	.info = (SNDRV_PCM_INFO_MMAP |
> +		 SNDRV_PCM_INFO_MMAP_VALID |
> +		 SNDRV_PCM_INFO_INTERLEAVED |
> +		 SNDRV_PCM_INFO_BLOCK_TRANSFER),
> +	.formats =          SNDRV_PCM_FMTBIT_S16_BE,
> +	.rates =            SNDRV_PCM_RATE_8000_48000,
> +	.rate_min =         8000,
> +	.rate_max =         48000,
> +	.channels_min =     2,
> +	.channels_max =     2,
> +	.buffer_bytes_max = 32768,
> +	.period_bytes_min = 1024,
> +	.period_bytes_max = 32768,
> +	.periods_min =      1,

periods_min=1 makes little sense for this driver.

> +	.periods_max =      128,
> +};
> +
> +static int n64audio_pcm_open(struct snd_pcm_substream *substream)
> +{
> +	struct snd_pcm_runtime *runtime = substream->runtime;
> +
> +	runtime->hw = n64audio_pcm_hw;
> +	return 0;

You likely need to set up more hw constraints.
For example, unless you set the integer periods, API allows unaligned
buffer sizes, i.e. period=1024 and buffer=2500, and it screws up the
transfer for this driver implementation.

> +static int n64audio_pcm_prepare(struct snd_pcm_substream *substream)
> +{
....
> +	spin_lock_irqsave(&priv->chan.lock, flags);

The prepare callback is always non-atomic, so you can use
spin_lock_irq() here.

> +static int n64audio_pcm_close(struct snd_pcm_substream *substream)
> +{
> +	return 0; // Nothing to do, but the kernel crashes if close() doesn't exist

You may clear the substream pointer, for example.  Then ISR might be
able to avoid accessing something wrong if it were triggered
mistakenly.

> +static int __init n64audio_probe(struct platform_device *pdev)

Usually the probe callback itself shouldn't be __init.

> +{
> +	struct snd_card *card;
> +	struct snd_pcm *pcm;
> +	struct n64audio_t *priv;
> +	int err;
> +
> +	err = snd_card_new(&pdev->dev, SNDRV_DEFAULT_IDX1,
> +			   SNDRV_DEFAULT_STR1,
> +			   THIS_MODULE, 0, &card);
> +	if (err < 0)
> +		return err;
> +
> +	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> +	if (priv == NULL) {
> +		err = -ENOMEM;
> +		goto fail_card;
> +	}

Note that card->private_data won't be released automatically.
For this kind of allocation, an easier way would be to pass
sizeof(*priv) in snd_card_new() call.  Then card->private_data points
to the allocated data and released altogether with snd_card_free().

> +
> +	priv->card = card;
> +	priv->ring_base = dma_alloc_coherent(card->dev, 32 * 1024,
> +					     &priv->ring_base_dma,
> +					     GFP_DMA | GFP_KERNEL);

There is no release for this?  I guess you need to define
card->private_free to do that or use devm stuff.

> +	if (!priv->ring_base)
> +		goto fail_alloc;
> +
> +	if (request_irq(RCP_IRQ, n64audio_isr,
> +				IRQF_SHARED, "N64 Audio", priv)) {
> +		err = -EBUSY;
> +		goto fail_alloc;
> +	}

Ditto, this needs the free_irq in card->private_free or devm.

> +
> +	spin_lock_init(&priv->chan.lock);

The initialization of spinlock must be done before registering the
ISR.   Do this at the very beginning.

> +static int __init n64audio_init(void)
> +{
> +	int ret;
> +
> +	ret = platform_driver_probe(&n64audio_driver, n64audio_probe);
> +
> +	return ret;

This can simplified.

> +fs_initcall(n64audio_init);

Does it have to be this initcall?


thanks,

Takashi
Lauri Kasanen Jan. 8, 2021, 10:13 a.m. UTC | #2
On Fri, 08 Jan 2021 10:06:48 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> > +static int __init n64audio_probe(struct platform_device *pdev)
>
> Usually the probe callback itself shouldn't be __init.
>
> > +fs_initcall(n64audio_init);
>
> Does it have to be this initcall?

It could be module init instead, nothing specific in fs_initcall. It's
__init because it can only be compiled in, and removing that run-once
code saves RAM. The target only has 8mb RAM.

- Lauri
Lauri Kasanen Jan. 9, 2021, 7:23 a.m. UTC | #3
Hi,

On Fri, 08 Jan 2021 10:06:48 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> > +static const struct snd_pcm_hardware n64audio_pcm_hw = {
> > +	.info = (SNDRV_PCM_INFO_MMAP |
> > +		 SNDRV_PCM_INFO_MMAP_VALID |
> > +		 SNDRV_PCM_INFO_INTERLEAVED |
> > +		 SNDRV_PCM_INFO_BLOCK_TRANSFER),
> > +	.formats =          SNDRV_PCM_FMTBIT_S16_BE,
> > +	.rates =            SNDRV_PCM_RATE_8000_48000,
> > +	.rate_min =         8000,
> > +	.rate_max =         48000,
> > +	.channels_min =     2,
> > +	.channels_max =     2,
> > +	.buffer_bytes_max = 32768,
> > +	.period_bytes_min = 1024,
> > +	.period_bytes_max = 32768,
> > +	.periods_min =      1,
>
> periods_min=1 makes little sense for this driver.

I have some questions about this.

When I had periods_min = 128, OSS apps were broken. I mean simple ones,
open /dev/dsp, ioctl the format/rate/stereo, write data. They got an IO
error errno IIRC, and no clarifying error in dmesg.

I tried following the error with printks, several levels deep. I gave
up when it got to the constraint resolving function, and there was no
good way to print and track which constraint failed, why, and who set
the constraint.

Only through blind guessing did I stumble upon periods_min.

- why did it break OSS apps?
- how does the OSS layer interact with periods? I didn't find any "set
period" ioctl
- why was there no clarifying error in dmesg? Just an errno that means
a million things makes it impossible for the userspace app writer to
know why it's not working

- Lauri
Takashi Iwai Jan. 9, 2021, 8:16 a.m. UTC | #4
On Sat, 09 Jan 2021 08:23:03 +0100,
Lauri Kasanen wrote:
> 
> Hi,
> 
> On Fri, 08 Jan 2021 10:06:48 +0100
> Takashi Iwai <tiwai@suse.de> wrote:
> 
> > > +static const struct snd_pcm_hardware n64audio_pcm_hw = {
> > > +	.info = (SNDRV_PCM_INFO_MMAP |
> > > +		 SNDRV_PCM_INFO_MMAP_VALID |
> > > +		 SNDRV_PCM_INFO_INTERLEAVED |
> > > +		 SNDRV_PCM_INFO_BLOCK_TRANSFER),
> > > +	.formats =          SNDRV_PCM_FMTBIT_S16_BE,
> > > +	.rates =            SNDRV_PCM_RATE_8000_48000,
> > > +	.rate_min =         8000,
> > > +	.rate_max =         48000,
> > > +	.channels_min =     2,
> > > +	.channels_max =     2,
> > > +	.buffer_bytes_max = 32768,
> > > +	.period_bytes_min = 1024,
> > > +	.period_bytes_max = 32768,
> > > +	.periods_min =      1,
> >
> > periods_min=1 makes little sense for this driver.
> 
> I have some questions about this.
> 
> When I had periods_min = 128, OSS apps were broken. I mean simple ones,
> open /dev/dsp, ioctl the format/rate/stereo, write data. They got an IO
> error errno IIRC, and no clarifying error in dmesg.
> 
> I tried following the error with printks, several levels deep. I gave
> up when it got to the constraint resolving function, and there was no
> good way to print and track which constraint failed, why, and who set
> the constraint.

Did you try to set up the hw constraint for the integer PERIODS like
below at open?
  snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIODS)

Without this, it'd allow inconsistent buffer/period set up in your
case.

> Only through blind guessing did I stumble upon periods_min.

The periods_min usually defines the hardware/software limit of the
interrupt transfer.

> - why did it break OSS apps?
> - how does the OSS layer interact with periods? I didn't find any "set
> period" ioctl

OSS layers do the same as the native API via OSS emulation in
sound/core/oss/pcm*.c.

> - why was there no clarifying error in dmesg? Just an errno that means
> a million things makes it impossible for the userspace app writer to
> know why it's not working

Did you check the debug messages with dyndbg enabled?


Takashi
Lauri Kasanen Jan. 9, 2021, 5:46 p.m. UTC | #5
On Sat, 09 Jan 2021 09:16:08 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> > > > +static const struct snd_pcm_hardware n64audio_pcm_hw = {
> > > > +	.info = (SNDRV_PCM_INFO_MMAP |
> > > > +		 SNDRV_PCM_INFO_MMAP_VALID |
> > > > +		 SNDRV_PCM_INFO_INTERLEAVED |
> > > > +		 SNDRV_PCM_INFO_BLOCK_TRANSFER),
> > > > +	.formats =          SNDRV_PCM_FMTBIT_S16_BE,
> > > > +	.rates =            SNDRV_PCM_RATE_8000_48000,
> > > > +	.rate_min =         8000,
> > > > +	.rate_max =         48000,
> > > > +	.channels_min =     2,
> > > > +	.channels_max =     2,
> > > > +	.buffer_bytes_max = 32768,
> > > > +	.period_bytes_min = 1024,
> > > > +	.period_bytes_max = 32768,
> > > > +	.periods_min =      1,
> > >
> > > periods_min=1 makes little sense for this driver.
> >
> > I have some questions about this.
> >
> > When I had periods_min = 128, OSS apps were broken. I mean simple ones,
> > open /dev/dsp, ioctl the format/rate/stereo, write data. They got an IO
> > error errno IIRC, and no clarifying error in dmesg.
> >
> > I tried following the error with printks, several levels deep. I gave
> > up when it got to the constraint resolving function, and there was no
> > good way to print and track which constraint failed, why, and who set
> > the constraint.
>
> Did you try to set up the hw constraint for the integer PERIODS like
> below at open?
>   snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIODS)
>
> Without this, it'd allow inconsistent buffer/period set up in your
> case.

No, not yet. But surely an inconsistent buffer size would still play
something, instead of immediately erroring out?

> > Only through blind guessing did I stumble upon periods_min.
>
> The periods_min usually defines the hardware/software limit of the
> interrupt transfer.

Why do you say periods_min=1 makes little sense? At 44.1 khz, that'd be
172 interrupts per second, which is a lot but workable? There is no hw
limit against 172 irqs/s.

> > - why was there no clarifying error in dmesg? Just an errno that means
> > a million things makes it impossible for the userspace app writer to
> > know why it's not working
>
> Did you check the debug messages with dyndbg enabled?

No, CONFIG_DYNAMIC_DEBUG, CONFIG_DEBUG_FS and CONFIG_SND_DEBUG are all
disabled because this is a memory-constrained platform. Surely "why my
app is not producing sound" is not something that needs several
different kernel debug options enabled (+ root perms b/c debugfs).

- Lauri
Takashi Iwai Jan. 9, 2021, 6:17 p.m. UTC | #6
On Sat, 09 Jan 2021 18:46:01 +0100,
Lauri Kasanen wrote:
> 
> On Sat, 09 Jan 2021 09:16:08 +0100
> Takashi Iwai <tiwai@suse.de> wrote:
> 
> > > > > +static const struct snd_pcm_hardware n64audio_pcm_hw = {
> > > > > +	.info = (SNDRV_PCM_INFO_MMAP |
> > > > > +		 SNDRV_PCM_INFO_MMAP_VALID |
> > > > > +		 SNDRV_PCM_INFO_INTERLEAVED |
> > > > > +		 SNDRV_PCM_INFO_BLOCK_TRANSFER),
> > > > > +	.formats =          SNDRV_PCM_FMTBIT_S16_BE,
> > > > > +	.rates =            SNDRV_PCM_RATE_8000_48000,
> > > > > +	.rate_min =         8000,
> > > > > +	.rate_max =         48000,
> > > > > +	.channels_min =     2,
> > > > > +	.channels_max =     2,
> > > > > +	.buffer_bytes_max = 32768,
> > > > > +	.period_bytes_min = 1024,
> > > > > +	.period_bytes_max = 32768,
> > > > > +	.periods_min =      1,
> > > >
> > > > periods_min=1 makes little sense for this driver.
> > >
> > > I have some questions about this.
> > >
> > > When I had periods_min = 128, OSS apps were broken. I mean simple ones,
> > > open /dev/dsp, ioctl the format/rate/stereo, write data. They got an IO
> > > error errno IIRC, and no clarifying error in dmesg.
> > >
> > > I tried following the error with printks, several levels deep. I gave
> > > up when it got to the constraint resolving function, and there was no
> > > good way to print and track which constraint failed, why, and who set
> > > the constraint.
> >
> > Did you try to set up the hw constraint for the integer PERIODS like
> > below at open?
> >   snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIODS)
> >
> > Without this, it'd allow inconsistent buffer/period set up in your
> > case.
> 
> No, not yet. But surely an inconsistent buffer size would still play
> something, instead of immediately erroring out?

In some cases, it's possible.  PCM OSS translation has some special
way depending on the period ("fragment" in OSS) setup...

> > > Only through blind guessing did I stumble upon periods_min.
> >
> > The periods_min usually defines the hardware/software limit of the
> > interrupt transfer.
> 
> Why do you say periods_min=1 makes little sense? At 44.1 khz, that'd be
> 172 interrupts per second, which is a lot but workable? There is no hw
> limit against 172 irqs/s.

Well, it's not about the sample rate or the process speed.  You need
to know what periods=1 means.  periods=1 is a VERY special usage.  No
double buffering and the driver has to report the precise accurate
position without period updates; i.e. it's almost for free-wheeling
DMA transfer.  Hence periods_min=1 makes sense if the driver may
behave like that.

> > > - why was there no clarifying error in dmesg? Just an errno that means
> > > a million things makes it impossible for the userspace app writer to
> > > know why it's not working
> >
> > Did you check the debug messages with dyndbg enabled?
> 
> No, CONFIG_DYNAMIC_DEBUG, CONFIG_DEBUG_FS and CONFIG_SND_DEBUG are all
> disabled because this is a memory-constrained platform. Surely "why my
> app is not producing sound" is not something that needs several
> different kernel debug options enabled (+ root perms b/c debugfs).

But you are debugging the *kernel* problem, not the application.
I agree that debugfs isn't always needed for hunting application bugs,
yeah.


Takashi
Takashi Iwai Jan. 9, 2021, 8:54 p.m. UTC | #7
On Sat, 09 Jan 2021 19:17:16 +0100,
Takashi Iwai wrote:
> 
> On Sat, 09 Jan 2021 18:46:01 +0100,
> Lauri Kasanen wrote:
> > 
> > On Sat, 09 Jan 2021 09:16:08 +0100
> > Takashi Iwai <tiwai@suse.de> wrote:
> > 
> > > > > > +static const struct snd_pcm_hardware n64audio_pcm_hw = {
> > > > > > +	.info = (SNDRV_PCM_INFO_MMAP |
> > > > > > +		 SNDRV_PCM_INFO_MMAP_VALID |
> > > > > > +		 SNDRV_PCM_INFO_INTERLEAVED |
> > > > > > +		 SNDRV_PCM_INFO_BLOCK_TRANSFER),
> > > > > > +	.formats =          SNDRV_PCM_FMTBIT_S16_BE,
> > > > > > +	.rates =            SNDRV_PCM_RATE_8000_48000,
> > > > > > +	.rate_min =         8000,
> > > > > > +	.rate_max =         48000,
> > > > > > +	.channels_min =     2,
> > > > > > +	.channels_max =     2,
> > > > > > +	.buffer_bytes_max = 32768,
> > > > > > +	.period_bytes_min = 1024,
> > > > > > +	.period_bytes_max = 32768,
> > > > > > +	.periods_min =      1,
> > > > >
> > > > > periods_min=1 makes little sense for this driver.
> > > >
> > > > I have some questions about this.
> > > >
> > > > When I had periods_min = 128, OSS apps were broken. I mean simple ones,
> > > > open /dev/dsp, ioctl the format/rate/stereo, write data. They got an IO
> > > > error errno IIRC, and no clarifying error in dmesg.
> > > >
> > > > I tried following the error with printks, several levels deep. I gave
> > > > up when it got to the constraint resolving function, and there was no
> > > > good way to print and track which constraint failed, why, and who set
> > > > the constraint.
> > >
> > > Did you try to set up the hw constraint for the integer PERIODS like
> > > below at open?
> > >   snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIODS)
> > >
> > > Without this, it'd allow inconsistent buffer/period set up in your
> > > case.
> > 
> > No, not yet. But surely an inconsistent buffer size would still play
> > something, instead of immediately erroring out?
> 
> In some cases, it's possible.  PCM OSS translation has some special
> way depending on the period ("fragment" in OSS) setup...
> 
> > > > Only through blind guessing did I stumble upon periods_min.
> > >
> > > The periods_min usually defines the hardware/software limit of the
> > > interrupt transfer.
> > 
> > Why do you say periods_min=1 makes little sense? At 44.1 khz, that'd be
> > 172 interrupts per second, which is a lot but workable? There is no hw
> > limit against 172 irqs/s.
> 
> Well, it's not about the sample rate or the process speed.  You need
> to know what periods=1 means.  periods=1 is a VERY special usage.  No
> double buffering and the driver has to report the precise accurate
> position without period updates; i.e. it's almost for free-wheeling
> DMA transfer.  Hence periods_min=1 makes sense if the driver may
> behave like that.

And, after reading the patch again, I found another issue that is
relevant with this. 

The function transfers the data chunk of the period size is
implemented as:

static void n64audio_push(struct n64audio_t *priv, uint8_t irq)
{
	....
	memcpy(priv->ring_base, runtime->dma_area + priv->chan.nextpos, count);

	n64audio_write_reg(AI_ADDR_REG, priv->ring_base_dma);
	n64audio_write_reg(AI_LEN_REG, count);

	priv->chan.nextpos += count;
	priv->chan.nextpos %= priv->chan.bufsize;
	if (irq)
		priv->chan.pos = priv->chan.nextpos;

... and this is called from two places, the interrupt handler:

static irqreturn_t n64audio_isr(int irq, void *dev_id)
{
	....
	n64audio_push(priv, 1);
	snd_pcm_period_elapsed(priv->chan.substream);

... and the trigger START:

static int n64audio_pcm_trigger(struct snd_pcm_substream *substream,
				int cmd)
{
	switch (cmd) {
	case SNDRV_PCM_TRIGGER_START:
		n64audio_push(substream->pcm->private_data, 0);

Meanwhile the pointer callback returns the chan.pos:

static snd_pcm_uframes_t n64audio_pcm_pointer(struct snd_pcm_substream *substream)
{
	....
	return bytes_to_frames(substream->runtime,
			       priv->chan.pos);

When the start starts, it copies the full period size data, and moves
nextpos to period size while keeping pos 0.  And, at this moment, it's
even possible that no enough data has been filled for the period
size; this is practically a buffer underrun.
Usually PCM core can catch the buffer underrun by comparing the
current position reported by the pointer callback against the filled
size, but in this case, PCM core can't know of it because the driver
just tells the position 0.  This is one problem.

Then, at the first period IRQ, the next period is copied, then nextpos
becomes 2*period_size.  At this moment, pos = nextpos, hence it jumps
from 0 to 2*periodsize out of sudden.  It's quite confusing behavior
for applications.  That's the second problem.

I guess that both problems could be avoided if n64audio_pointer()
reports always nextpos instead of pos.  And, the driver may set
runtime->delay value to correct back for the period size; ideally this 
should correct to the actual playback position, but since we can't
read it, nothing but a constant max correction (= period size) is
applicable.


Takashi
Lauri Kasanen Jan. 10, 2021, 7:15 a.m. UTC | #8
On Sat, 09 Jan 2021 21:54:12 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> When the start starts, it copies the full period size data, and moves
> nextpos to period size while keeping pos 0.  And, at this moment, it's
> even possible that no enough data has been filled for the period
> size; this is practically a buffer underrun.
> Usually PCM core can catch the buffer underrun by comparing the
> current position reported by the pointer callback against the filled
> size, but in this case, PCM core can't know of it because the driver
> just tells the position 0.  This is one problem.
>
> Then, at the first period IRQ, the next period is copied, then nextpos
> becomes 2*period_size.  At this moment, pos = nextpos, hence it jumps
> from 0 to 2*periodsize out of sudden.  It's quite confusing behavior
> for applications.  That's the second problem.
>
> I guess that both problems could be avoided if n64audio_pointer()
> reports always nextpos instead of pos.

At first there was no nextpos, and _pointer() always reported pos. This
didn't work, the core played through the audio at chipmunk speed. So
there must be more that I don't understand here.

Let me describe the hw, perhaps a different approach would be better.
- the DMA unit requires 8-byte alignment and 8-divisible size (two
frames at the only supported format, s16be stereo)
- the DMA unit has errata if (start + len) & 0x3fff == 0x2000, this
must never happen
- the audio IRQ is not a timer, it fires when the card's internal
buffers are empty and need immediate refill

- Lauri
Takashi Iwai Jan. 10, 2021, 10:24 a.m. UTC | #9
On Sun, 10 Jan 2021 08:15:36 +0100,
Lauri Kasanen wrote:
> 
> On Sat, 09 Jan 2021 21:54:12 +0100
> Takashi Iwai <tiwai@suse.de> wrote:
> 
> > When the start starts, it copies the full period size data, and moves
> > nextpos to period size while keeping pos 0.  And, at this moment, it's
> > even possible that no enough data has been filled for the period
> > size; this is practically a buffer underrun.
> > Usually PCM core can catch the buffer underrun by comparing the
> > current position reported by the pointer callback against the filled
> > size, but in this case, PCM core can't know of it because the driver
> > just tells the position 0.  This is one problem.
> >
> > Then, at the first period IRQ, the next period is copied, then nextpos
> > becomes 2*period_size.  At this moment, pos = nextpos, hence it jumps
> > from 0 to 2*periodsize out of sudden.  It's quite confusing behavior
> > for applications.  That's the second problem.
> >
> > I guess that both problems could be avoided if n64audio_pointer()
> > reports always nextpos instead of pos.
> 
> At first there was no nextpos, and _pointer() always reported pos. This
> didn't work, the core played through the audio at chipmunk speed. So
> there must be more that I don't understand here.

Try to set the periods_min=2 and the integer periods hw constraint at
first, and change the pointer callback to return nextpos.  Also, at
the push function, set runtime->delay = period_size as well.

> Let me describe the hw, perhaps a different approach would be better.
> - the DMA unit requires 8-byte alignment and 8-divisible size (two
> frames at the only supported format, s16be stereo)

This restriction needs to be set up as another hw constraint as long
as you use the period size as the transfer size.

> - the DMA unit has errata if (start + len) & 0x3fff == 0x2000, this
> must never happen

Ditto.

> - the audio IRQ is not a timer, it fires when the card's internal
> buffers are empty and need immediate refill

It's the current implementation, right?


Takashi
Lauri Kasanen Jan. 10, 2021, 5:03 p.m. UTC | #10
On Sun, 10 Jan 2021 11:24:22 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> > At first there was no nextpos, and _pointer() always reported pos. This
> > didn't work, the core played through the audio at chipmunk speed. So
> > there must be more that I don't understand here.
>
> Try to set the periods_min=2 and the integer periods hw constraint at
> first, and change the pointer callback to return nextpos.  Also, at
> the push function, set runtime->delay = period_size as well.

When I do all this, it still causes the chipmunk speed. Several seconds
of audio gets played in 0.3s or so. Sorry if this is taking too much of
your time, I'm a bit lost here at what the alsa core is expecting.

Printks show the following repeats:
start, period size 1024
push, bool irq=0
irq fired
push, bool irq=1
pointer at 8192
stop

It stops and starts again for some reason. This does not happen in the
current pos/nextpos implementation.

> > - the DMA unit has errata if (start + len) & 0x3fff == 0x2000, this
> > must never happen
>
> Ditto.

Can it really handle a constraint this complex? That'd be impressive.
It'd remove the memcpy need if so.

> > - the audio IRQ is not a timer, it fires when the card's internal
> > buffers are empty and need immediate refill
>
> It's the current implementation, right?

Yes, but I was wondering if a different setup would work better. The
alsa setup is a bit confusing to me still.

- Lauri
Takashi Iwai Jan. 10, 2021, 5:22 p.m. UTC | #11
On Sun, 10 Jan 2021 18:03:32 +0100,
Lauri Kasanen wrote:
> 
> On Sun, 10 Jan 2021 11:24:22 +0100
> Takashi Iwai <tiwai@suse.de> wrote:
> 
> > > At first there was no nextpos, and _pointer() always reported pos. This
> > > didn't work, the core played through the audio at chipmunk speed. So
> > > there must be more that I don't understand here.
> >
> > Try to set the periods_min=2 and the integer periods hw constraint at
> > first, and change the pointer callback to return nextpos.  Also, at
> > the push function, set runtime->delay = period_size as well.
> 
> When I do all this, it still causes the chipmunk speed. Several seconds
> of audio gets played in 0.3s or so. Sorry if this is taking too much of
> your time, I'm a bit lost here at what the alsa core is expecting.
> 
> Printks show the following repeats:
> start, period size 1024
> push, bool irq=0
> irq fired
> push, bool irq=1
> pointer at 8192
> stop

Hm, is the above about the result with the pointer callback returning
pos, not nextpos?  If so,

> start, period size 1024
> push, bool irq=0

At this moment, nextpos is 1024, and it should take some time until

> irq fired

... this IRQ is triggered; it must be the period time.
Was the reported timing as expected?

> push, bool irq=1
> pointer at 8192

If it's the first IRQ, isn't the nextpos 2048?

> stop

It's stopped likely because the position reported the driver is beyond
the data size the application filled (XRUN).

> It stops and starts again for some reason. This does not happen in the
> current pos/nextpos implementation.
> 
> > > - the DMA unit has errata if (start + len) & 0x3fff == 0x2000, this
> > > must never happen
> >
> > Ditto.
> 
> Can it really handle a constraint this complex? That'd be impressive.
> It'd remove the memcpy need if so.

The hw constraint can be just a function that filters the min/max
value the driver may accept, so such a rule is possible.

> > > - the audio IRQ is not a timer, it fires when the card's internal
> > > buffers are empty and need immediate refill
> >
> > It's the current implementation, right?
> 
> Yes, but I was wondering if a different setup would work better. The
> alsa setup is a bit confusing to me still.

The implementation became a bit complex due to the hardware
restriction, yeah.


thanks,

Takashi
Lauri Kasanen Jan. 10, 2021, 5:41 p.m. UTC | #12
On Sun, 10 Jan 2021 18:22:50 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> On Sun, 10 Jan 2021 18:03:32 +0100,
> Lauri Kasanen wrote:
> > On Sun, 10 Jan 2021 11:24:22 +0100
> > Takashi Iwai <tiwai@suse.de> wrote:
> >
> > > > At first there was no nextpos, and _pointer() always reported pos. This
> > > > didn't work, the core played through the audio at chipmunk speed. So
> > > > there must be more that I don't understand here.
> > >
> > > Try to set the periods_min=2 and the integer periods hw constraint at
> > > first, and change the pointer callback to return nextpos.  Also, at
> > > the push function, set runtime->delay = period_size as well.
> >
> > When I do all this, it still causes the chipmunk speed. Several seconds
> > of audio gets played in 0.3s or so. Sorry if this is taking too much of
> > your time, I'm a bit lost here at what the alsa core is expecting.
> >
> > Printks show the following repeats:
> > start, period size 1024
> > push, bool irq=0
> > irq fired
> > push, bool irq=1
> > pointer at 8192
> > stop
>
> Hm, is the above about the result with the pointer callback returning
> pos, not nextpos?  If so,

It was returning nextpos, but the pointer printk was in bytes. 8192
bytes = 2048 frames.

> > start, period size 1024
> > push, bool irq=0
>
> At this moment, nextpos is 1024, and it should take some time until
>
> > irq fired
>
> ... this IRQ is triggered; it must be the period time.
> Was the reported timing as expected?

It's roughly correct, but timing is not very precise, as printk itself
has heavy overhead for the 93 MHz cpu.

- Lauri
Takashi Iwai Jan. 11, 2021, 8:05 a.m. UTC | #13
On Sun, 10 Jan 2021 18:41:46 +0100,
Lauri Kasanen wrote:
> 
> On Sun, 10 Jan 2021 18:22:50 +0100
> Takashi Iwai <tiwai@suse.de> wrote:
> 
> > On Sun, 10 Jan 2021 18:03:32 +0100,
> > Lauri Kasanen wrote:
> > > On Sun, 10 Jan 2021 11:24:22 +0100
> > > Takashi Iwai <tiwai@suse.de> wrote:
> > >
> > > > > At first there was no nextpos, and _pointer() always reported pos. This
> > > > > didn't work, the core played through the audio at chipmunk speed. So
> > > > > there must be more that I don't understand here.
> > > >
> > > > Try to set the periods_min=2 and the integer periods hw constraint at
> > > > first, and change the pointer callback to return nextpos.  Also, at
> > > > the push function, set runtime->delay = period_size as well.
> > >
> > > When I do all this, it still causes the chipmunk speed. Several seconds
> > > of audio gets played in 0.3s or so. Sorry if this is taking too much of
> > > your time, I'm a bit lost here at what the alsa core is expecting.
> > >
> > > Printks show the following repeats:
> > > start, period size 1024
> > > push, bool irq=0
> > > irq fired
> > > push, bool irq=1
> > > pointer at 8192
> > > stop
> >
> > Hm, is the above about the result with the pointer callback returning
> > pos, not nextpos?  If so,
> 
> It was returning nextpos, but the pointer printk was in bytes. 8192
> bytes = 2048 frames.

OK, then it must be right.

Then I suppose that the update of pos should be changed in a different
way; it should always point to the previous nextpos.  That is,
something like:

static void n64audio_push(struct n64audio_t *priv, uint8_t irq)
{
	....
	if (irq)
		priv->chan.pos = priv->chan.nextpos;
	priv->chan.nextpos += count;
	priv->chan.nextpos %= priv->chan.bufsize;

If we use nextpos as the position, it'll lead to the double steps at
the first IRQ handling without snd_pcm_period_elapsed() call (the
first step missed it), and this may confuse PCM core.


> > > start, period size 1024
> > > push, bool irq=0
> >
> > At this moment, nextpos is 1024, and it should take some time until
> >
> > > irq fired
> >
> > ... this IRQ is triggered; it must be the period time.
> > Was the reported timing as expected?
> 
> It's roughly correct, but timing is not very precise, as printk itself
> has heavy overhead for the 93 MHz cpu.

OK, that sounds good, at least.


thanks,

Takashi
Lauri Kasanen Jan. 11, 2021, 9:43 a.m. UTC | #14
On Mon, 11 Jan 2021 09:05:04 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> On Sun, 10 Jan 2021 18:41:46 +0100,
> Lauri Kasanen wrote:
> > It was returning nextpos, but the pointer printk was in bytes. 8192
> > bytes = 2048 frames.
>
> OK, then it must be right.
>
> Then I suppose that the update of pos should be changed in a different
> way; it should always point to the previous nextpos.  That is,
> something like:
>
> static void n64audio_push(struct n64audio_t *priv, uint8_t irq)
> {
> 	....
> 	if (irq)
> 		priv->chan.pos = priv->chan.nextpos;
> 	priv->chan.nextpos += count;
> 	priv->chan.nextpos %= priv->chan.bufsize;
>
> If we use nextpos as the position, it'll lead to the double steps at
> the first IRQ handling without snd_pcm_period_elapsed() call (the
> first step missed it), and this may confuse PCM core.

This almost works, speed is correct, but the last part is played twice.

I wonder if the first, non-irq push should just push a silent buffer,
and not update pos or nextpos at all.

- Lauri
Takashi Iwai Jan. 11, 2021, 10:11 a.m. UTC | #15
On Mon, 11 Jan 2021 10:43:23 +0100,
Lauri Kasanen wrote:
> 
> On Mon, 11 Jan 2021 09:05:04 +0100
> Takashi Iwai <tiwai@suse.de> wrote:
> 
> > On Sun, 10 Jan 2021 18:41:46 +0100,
> > Lauri Kasanen wrote:
> > > It was returning nextpos, but the pointer printk was in bytes. 8192
> > > bytes = 2048 frames.
> >
> > OK, then it must be right.
> >
> > Then I suppose that the update of pos should be changed in a different
> > way; it should always point to the previous nextpos.  That is,
> > something like:
> >
> > static void n64audio_push(struct n64audio_t *priv, uint8_t irq)
> > {
> > 	....
> > 	if (irq)
> > 		priv->chan.pos = priv->chan.nextpos;
> > 	priv->chan.nextpos += count;
> > 	priv->chan.nextpos %= priv->chan.bufsize;
> >
> > If we use nextpos as the position, it'll lead to the double steps at
> > the first IRQ handling without snd_pcm_period_elapsed() call (the
> > first step missed it), and this may confuse PCM core.
> 
> This almost works, speed is correct, but the last part is played twice.

Oh yes, at the last IRQ, the push should be avoided.
I guess that the code order should be changed to the following way:

  1. advance the position for a period size
  2. call snd_pcm_period_elapsed()
  3. check if the stream is still running
  4. copy the next chunk and update nextpos

So, it's something like:

static irqreturn_t n64audio_isr(int irq, void *dev_id)
{
	struct n64audio *priv = dev_id;

	// Check it's ours
	const u32 intrs = n64mi_read_reg(MI_INTR_REG);
	if (!(intrs & MI_INTR_AI))
		return IRQ_NONE;

	n64audio_write_reg(AI_STATUS_REG, 1);

	priv->chan.pos += priv->chan.nextpos;
	snd_pcm_period_elapsed(priv->chan.substream);
	if (priv->chan.substream == SNDRV_PCM_STATE_RUNNING)
		n64audio_push(priv);

	return IRQ_HANDLED;
}

By calling snd_pcm_period_elapsed(), PCM core detects the end of the
stream and it stops with the state change.  Then we can avoid the
unnecessary push after the stop.

The irq argument in n64audio_push() is dropped in the above, as it's
superfluous now.


Takashi
Lauri Kasanen Jan. 11, 2021, 12:02 p.m. UTC | #16
On Mon, 11 Jan 2021 11:11:39 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> > This almost works, speed is correct, but the last part is played twice.
>
> Oh yes, at the last IRQ, the push should be avoided.
> I guess that the code order should be changed to the following way:
>
>   1. advance the position for a period size
>   2. call snd_pcm_period_elapsed()
>   3. check if the stream is still running
>   4. copy the next chunk and update nextpos

This order gives correct pointer advancing etc, but now it's hitting a
new problem: the pcm core is reusing the buffer from under the audio
card. It's writing new data to the area that is currently being read by
DMA.

I assume the core expects DMA to be instant, but in this card's case
it's ondemand, reading bytes as needed.

By restoring the memcpy buffer, I get good audio with this new order
(sans occasional crackling due to the memcpy taking too long).

- Lauri
Takashi Iwai Jan. 11, 2021, 3:25 p.m. UTC | #17
On Mon, 11 Jan 2021 13:02:22 +0100,
Lauri Kasanen wrote:
> 
> On Mon, 11 Jan 2021 11:11:39 +0100
> Takashi Iwai <tiwai@suse.de> wrote:
> 
> > > This almost works, speed is correct, but the last part is played twice.
> >
> > Oh yes, at the last IRQ, the push should be avoided.
> > I guess that the code order should be changed to the following way:
> >
> >   1. advance the position for a period size
> >   2. call snd_pcm_period_elapsed()
> >   3. check if the stream is still running
> >   4. copy the next chunk and update nextpos
> 
> This order gives correct pointer advancing etc, but now it's hitting a
> new problem: the pcm core is reusing the buffer from under the audio
> card. It's writing new data to the area that is currently being read by
> DMA.

Could you elaborate?  I still don't get what's going on there.

> I assume the core expects DMA to be instant, but in this card's case
> it's ondemand, reading bytes as needed.

No, PCM core doesn't expect it.  The only expectation is that the data
from the current pointer to the next period boundary will be
transferred until the next period-elapsed call.

> By restoring the memcpy buffer, I get good audio with this new order
> (sans occasional crackling due to the memcpy taking too long).

The overwriting problem has been already present in the original patch
version as I already argued.  The driver copies the full next period
to be updated, but this is never guaranteed to have been filled by the
application.  Maybe this didn't surface obviously with the original
version because it essentially gives one more period available on the
buffer -- i.e. it might be a matter of number of periods.


Takashi
Lauri Kasanen Jan. 11, 2021, 3:51 p.m. UTC | #18
On Mon, 11 Jan 2021 16:25:08 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> On Mon, 11 Jan 2021 13:02:22 +0100,
> Lauri Kasanen wrote:
> >
> > On Mon, 11 Jan 2021 11:11:39 +0100
> > Takashi Iwai <tiwai@suse.de> wrote:
> > > Oh yes, at the last IRQ, the push should be avoided.
> > > I guess that the code order should be changed to the following way:
> > >
> > >   1. advance the position for a period size
> > >   2. call snd_pcm_period_elapsed()
> > >   3. check if the stream is still running
> > >   4. copy the next chunk and update nextpos
> >
> > This order gives correct pointer advancing etc, but now it's hitting a
> > new problem: the pcm core is reusing the buffer from under the audio
> > card. It's writing new data to the area that is currently being read by
> > DMA.
>
> Could you elaborate?  I still don't get what's going on there.

The audio plays partially out of order. At the point when the IRQ
fires, the data for the next period is correct, but it becomes
incorrect during playing it.

This is clear because if I memcpy it in the isr, like in the first
patch, the playback is correct. When copied to a private buffer, the
pcm core can't overwrite it when the card is reading it.

However the playback speed is correct, and when I print out the events,
the pointer advances correctly, one period at a time, and there are no
continuous stops and restarts.

> > I assume the core expects DMA to be instant, but in this card's case
> > it's ondemand, reading bytes as needed.
>
> No, PCM core doesn't expect it.  The only expectation is that the data
> from the current pointer to the next period boundary will be
> transferred until the next period-elapsed call.

Curious. This problem shouldn't exist then if everything is correct.
I'm thankful for your help in solving these; I haven't done alsa
hacking before.

In the current code, everything should be in sync with how you wrote.

period size 1026 frames
irq fires for the first time, pos becomes 1026*4=4104 bytes
snd_pcm_period_elapsed gets called, pointer returns 4104/4 frames
stream is running, so we call push()
push() starts playing at 4104 bytes into alsa's dma buffer

So the driver is playing the second period, and the core shouldn't be
writing into it as the pointer points to the start of that period.

> > By restoring the memcpy buffer, I get good audio with this new order
> > (sans occasional crackling due to the memcpy taking too long).
>
> The overwriting problem has been already present in the original patch
> version as I already argued.  The driver copies the full next period
> to be updated, but this is never guaranteed to have been filled by the
> application.  Maybe this didn't surface obviously with the original
> version because it essentially gives one more period available on the
> buffer -- i.e. it might be a matter of number of periods.

Yes, this startup issue was present, but I believe it's different to
the current issue. I could be wrong though.

- Lauri
Lauri Kasanen Jan. 13, 2021, 11:57 a.m. UTC | #19
On Mon, 11 Jan 2021 16:25:08 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> On Mon, 11 Jan 2021 13:02:22 +0100,
> Lauri Kasanen wrote:
> > This order gives correct pointer advancing etc, but now it's hitting a
> > new problem: the pcm core is reusing the buffer from under the audio
> > card. It's writing new data to the area that is currently being read by
> > DMA.
>
> Could you elaborate?  I still don't get what's going on there.

I figured it out. Turns out the hw registers were double-buffered in a
way that requires two periods' worth of buffers. The IRQ fires when one
buffer is finished and another is queued, not when everything is
finished as I first thought.

There doesn't seem to be a way to request the PCM core to keep two
periods' distance instead of one? I will deploy memcpy then.

- Lauri
Takashi Iwai Jan. 13, 2021, 12:04 p.m. UTC | #20
On Wed, 13 Jan 2021 12:57:16 +0100,
Lauri Kasanen wrote:
> 
> On Mon, 11 Jan 2021 16:25:08 +0100
> Takashi Iwai <tiwai@suse.de> wrote:
> 
> > On Mon, 11 Jan 2021 13:02:22 +0100,
> > Lauri Kasanen wrote:
> > > This order gives correct pointer advancing etc, but now it's hitting a
> > > new problem: the pcm core is reusing the buffer from under the audio
> > > card. It's writing new data to the area that is currently being read by
> > > DMA.
> >
> > Could you elaborate?  I still don't get what's going on there.
> 
> I figured it out. Turns out the hw registers were double-buffered in a
> way that requires two periods' worth of buffers. The IRQ fires when one
> buffer is finished and another is queued, not when everything is
> finished as I first thought.
> 
> There doesn't seem to be a way to request the PCM core to keep two
> periods' distance instead of one? I will deploy memcpy then.

We may return to the first approach, i.e. just use nextpos.  But then
snd_pcm_period_elapsed() has to be called right after the trigger
callback without the IRQ, because the trigger START already queued the
full period and the position advances.  So the first period-elapsed
has to be called from a work or such offload instead of IRQ.
In anyway, it's a bit tricky, yeah.


Takashi
Lauri Kasanen Jan. 13, 2021, 12:14 p.m. UTC | #21
On Wed, 13 Jan 2021 13:04:50 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> On Wed, 13 Jan 2021 12:57:16 +0100,
> Lauri Kasanen wrote:
> > I figured it out. Turns out the hw registers were double-buffered in a
> > way that requires two periods' worth of buffers. The IRQ fires when one
> > buffer is finished and another is queued, not when everything is
> > finished as I first thought.
> >
> > There doesn't seem to be a way to request the PCM core to keep two
> > periods' distance instead of one? I will deploy memcpy then.
>
> We may return to the first approach, i.e. just use nextpos.  But then
> snd_pcm_period_elapsed() has to be called right after the trigger
> callback without the IRQ, because the trigger START already queued the
> full period and the position advances.  So the first period-elapsed
> has to be called from a work or such offload instead of IRQ.
> In anyway, it's a bit tricky, yeah.

I don't think that would work? When I printed out where
__snd_pcm_lib_xfer wrote data, it only steered clear of the current
period (pointer up to next period size). It wrote in every other part
of the buffer. This means the currently playing period would be racy.

The other point is that memcpy doesn't have a downside now - it doesn't
crackle when properly double-buffered. It's simpler this way vs
workqueues/etc.

- Lauri
Takashi Iwai Jan. 13, 2021, 12:38 p.m. UTC | #22
On Wed, 13 Jan 2021 13:14:53 +0100,
Lauri Kasanen wrote:
> 
> On Wed, 13 Jan 2021 13:04:50 +0100
> Takashi Iwai <tiwai@suse.de> wrote:
> 
> > On Wed, 13 Jan 2021 12:57:16 +0100,
> > Lauri Kasanen wrote:
> > > I figured it out. Turns out the hw registers were double-buffered in a
> > > way that requires two periods' worth of buffers. The IRQ fires when one
> > > buffer is finished and another is queued, not when everything is
> > > finished as I first thought.
> > >
> > > There doesn't seem to be a way to request the PCM core to keep two
> > > periods' distance instead of one? I will deploy memcpy then.
> >
> > We may return to the first approach, i.e. just use nextpos.  But then
> > snd_pcm_period_elapsed() has to be called right after the trigger
> > callback without the IRQ, because the trigger START already queued the
> > full period and the position advances.  So the first period-elapsed
> > has to be called from a work or such offload instead of IRQ.
> > In anyway, it's a bit tricky, yeah.
> 
> I don't think that would work? When I printed out where
> __snd_pcm_lib_xfer wrote data, it only steered clear of the current
> period (pointer up to next period size). It wrote in every other part
> of the buffer. This means the currently playing period would be racy.

By advancing the hwptr, it fetches the full data, and PCM core already
checks whether the data has been filled.  If not filled, it emits the
buffer underrun error to the application.  So it enforces the next
period filled beforehand.

> The other point is that memcpy doesn't have a downside now - it doesn't
> crackle when properly double-buffered. It's simpler this way vs
> workqueues/etc.

I'm fine with the other workaround as long as it works effectively.
It needs more description in the code, though :)


Takashi
Lauri Kasanen Jan. 13, 2021, 12:49 p.m. UTC | #23
On Wed, 13 Jan 2021 13:38:19 +0100
Takashi Iwai <tiwai@suse.de> wrote:

> > The other point is that memcpy doesn't have a downside now - it doesn't
> > crackle when properly double-buffered. It's simpler this way vs
> > workqueues/etc.
>
> I'm fine with the other workaround as long as it works effectively.
> It needs more description in the code, though :)

v4 has a comment about that in push(). Can you review v4 and point out
where it needs more comments?

- Lauri
diff mbox series

Patch

diff --git a/sound/mips/Kconfig b/sound/mips/Kconfig
index b497b80..c8b4db5 100644
--- a/sound/mips/Kconfig
+++ b/sound/mips/Kconfig
@@ -24,5 +24,12 @@  config SND_SGI_HAL2
 	help
 	  Sound support for the SGI Indy and Indigo2 Workstation.

+config SND_N64
+	bool "N64 Audio"
+	depends on MACH_NINTENDO64
+	select SND_PCM
+	help
+	  Sound support for the N64.
+
 endif	# SND_MIPS

diff --git a/sound/mips/Makefile b/sound/mips/Makefile
index ccc364e..7c86268 100644
--- a/sound/mips/Makefile
+++ b/sound/mips/Makefile
@@ -9,3 +9,4 @@  snd-sgi-hal2-objs := hal2.o
 # Toplevel Module Dependency
 obj-$(CONFIG_SND_SGI_O2) += snd-sgi-o2.o
 obj-$(CONFIG_SND_SGI_HAL2) += snd-sgi-hal2.o
+obj-$(CONFIG_SND_N64) += snd-n64.o
diff --git a/sound/mips/snd-n64.c b/sound/mips/snd-n64.c
new file mode 100644
index 0000000..0317fe2
--- /dev/null
+++ b/sound/mips/snd-n64.c
@@ -0,0 +1,297 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *   Sound driver for Nintendo 64.
+ *
+ *   Copyright 2020 Lauri Kasanen
+ */
+
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
+#include <linux/platform_device.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <sound/core.h>
+#include <sound/control.h>
+#include <sound/pcm.h>
+#include <sound/initval.h>
+
+#include <asm/addrspace.h>
+#include <asm/mach-n64/irq.h>
+
+MODULE_AUTHOR("Lauri Kasanen <cand@gmx.com>");
+MODULE_DESCRIPTION("N64 Audio");
+MODULE_LICENSE("GPL");
+
+#define AI_NTSC_DACRATE 48681812
+#define AI_STATUS_BUSY  (1 << 30)
+#define AI_STATUS_FULL  (1 << 31)
+
+#define REG_BASE ((u32 *) CKSEG1ADDR(0x4500000))
+
+#define AI_ADDR_REG 0
+#define AI_LEN_REG 1
+#define AI_CONTROL_REG 2
+#define AI_STATUS_REG 3
+#define AI_RATE_REG 4
+#define AI_BITCLOCK_REG 5
+
+#define MI_REG_BASE ((u32 *) CKSEG1ADDR(0x4300000))
+
+#define MI_INTR_REG 2
+#define MI_MASK_REG 3
+
+#define MI_INTR_AI 0x04
+
+#define MI_MASK_CLR_AI 0x0010
+#define MI_MASK_SET_AI 0x0020
+
+
+struct n64audio_t {
+	struct snd_card *card;
+
+	void *ring_base;
+	dma_addr_t ring_base_dma;
+
+	struct {
+		struct snd_pcm_substream *substream;
+		int pos, nextpos;
+		u32 writesize;
+		u32 bufsize;
+		spinlock_t lock;
+	} chan;
+};
+
+static void n64audio_write_reg(const u8 reg, const u32 value)
+{
+	__raw_writel(value, REG_BASE + reg);
+}
+
+static void n64mi_write_reg(const u8 reg, const u32 value)
+{
+	__raw_writel(value, MI_REG_BASE + reg);
+}
+
+static u32 n64mi_read_reg(const u8 reg)
+{
+	return __raw_readl(MI_REG_BASE + reg);
+}
+
+static void n64audio_push(struct n64audio_t *priv, uint8_t irq)
+{
+	struct snd_pcm_runtime *runtime = priv->chan.substream->runtime;
+	unsigned long flags;
+	u32 count;
+
+	count = priv->chan.writesize;
+	count &= ~7;
+
+	spin_lock_irqsave(&priv->chan.lock, flags);
+
+	memcpy(priv->ring_base, runtime->dma_area + priv->chan.nextpos, count);
+
+	n64audio_write_reg(AI_ADDR_REG, priv->ring_base_dma);
+	n64audio_write_reg(AI_LEN_REG, count);
+
+	priv->chan.nextpos += count;
+	priv->chan.nextpos %= priv->chan.bufsize;
+	if (irq)
+		priv->chan.pos = priv->chan.nextpos;
+
+	spin_unlock_irqrestore(&priv->chan.lock, flags);
+}
+
+static irqreturn_t n64audio_isr(int irq, void *dev_id)
+{
+	struct n64audio_t *priv = dev_id;
+
+	// Check it's ours
+	const u32 intrs = n64mi_read_reg(MI_INTR_REG);
+	if (!(intrs & MI_INTR_AI))
+		return IRQ_NONE;
+
+	n64audio_write_reg(AI_STATUS_REG, 1);
+
+	n64audio_push(priv, 1);
+	snd_pcm_period_elapsed(priv->chan.substream);
+
+	return IRQ_HANDLED;
+}
+
+static const struct snd_pcm_hardware n64audio_pcm_hw = {
+	.info = (SNDRV_PCM_INFO_MMAP |
+		 SNDRV_PCM_INFO_MMAP_VALID |
+		 SNDRV_PCM_INFO_INTERLEAVED |
+		 SNDRV_PCM_INFO_BLOCK_TRANSFER),
+	.formats =          SNDRV_PCM_FMTBIT_S16_BE,
+	.rates =            SNDRV_PCM_RATE_8000_48000,
+	.rate_min =         8000,
+	.rate_max =         48000,
+	.channels_min =     2,
+	.channels_max =     2,
+	.buffer_bytes_max = 32768,
+	.period_bytes_min = 1024,
+	.period_bytes_max = 32768,
+	.periods_min =      1,
+	.periods_max =      128,
+};
+
+static int n64audio_pcm_open(struct snd_pcm_substream *substream)
+{
+	struct snd_pcm_runtime *runtime = substream->runtime;
+
+	runtime->hw = n64audio_pcm_hw;
+	return 0;
+}
+
+static int n64audio_pcm_prepare(struct snd_pcm_substream *substream)
+{
+	struct snd_pcm_runtime *runtime = substream->runtime;
+	struct n64audio_t *priv = substream->pcm->private_data;
+	unsigned long flags;
+	u32 rate;
+
+	rate = ((2 * AI_NTSC_DACRATE / runtime->rate) + 1) / 2 - 1;
+
+	n64audio_write_reg(AI_RATE_REG, rate);
+
+	rate /= 66;
+	if (rate > 16)
+		rate = 16;
+	n64audio_write_reg(AI_BITCLOCK_REG, rate - 1);
+
+	spin_lock_irqsave(&priv->chan.lock, flags);
+
+	/* Setup the pseudo-dma transfer pointers.  */
+	priv->chan.pos = 0;
+	priv->chan.nextpos = 0;
+	priv->chan.substream = substream;
+	priv->chan.writesize = snd_pcm_lib_period_bytes(substream);
+	priv->chan.bufsize = snd_pcm_lib_buffer_bytes(substream);
+
+	spin_unlock_irqrestore(&priv->chan.lock, flags);
+	return 0;
+}
+
+static int n64audio_pcm_trigger(struct snd_pcm_substream *substream,
+				int cmd)
+{
+	switch (cmd) {
+	case SNDRV_PCM_TRIGGER_START:
+		n64audio_push(substream->pcm->private_data, 0);
+		n64audio_write_reg(AI_CONTROL_REG, 1);
+		n64mi_write_reg(MI_MASK_REG, MI_MASK_SET_AI);
+		break;
+	case SNDRV_PCM_TRIGGER_STOP:
+		n64audio_write_reg(AI_CONTROL_REG, 0);
+		n64mi_write_reg(MI_MASK_REG, MI_MASK_CLR_AI);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static snd_pcm_uframes_t n64audio_pcm_pointer(struct snd_pcm_substream *substream)
+{
+	struct n64audio_t *priv = substream->pcm->private_data;
+
+	return bytes_to_frames(substream->runtime,
+			       priv->chan.pos);
+}
+
+static int n64audio_pcm_close(struct snd_pcm_substream *substream)
+{
+	return 0; // Nothing to do, but the kernel crashes if close() doesn't exist
+}
+
+static const struct snd_pcm_ops n64audio_pcm_ops = {
+	.open =		n64audio_pcm_open,
+	.prepare =	n64audio_pcm_prepare,
+	.trigger =	n64audio_pcm_trigger,
+	.pointer =	n64audio_pcm_pointer,
+	.close =	n64audio_pcm_close,
+};
+
+static int __init n64audio_probe(struct platform_device *pdev)
+{
+	struct snd_card *card;
+	struct snd_pcm *pcm;
+	struct n64audio_t *priv;
+	int err;
+
+	err = snd_card_new(&pdev->dev, SNDRV_DEFAULT_IDX1,
+			   SNDRV_DEFAULT_STR1,
+			   THIS_MODULE, 0, &card);
+	if (err < 0)
+		return err;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv == NULL) {
+		err = -ENOMEM;
+		goto fail_card;
+	}
+
+	priv->card = card;
+	priv->ring_base = dma_alloc_coherent(card->dev, 32 * 1024,
+					     &priv->ring_base_dma,
+					     GFP_DMA | GFP_KERNEL);
+	if (!priv->ring_base)
+		goto fail_alloc;
+
+	if (request_irq(RCP_IRQ, n64audio_isr,
+				IRQF_SHARED, "N64 Audio", priv)) {
+		err = -EBUSY;
+		goto fail_alloc;
+	}
+
+	spin_lock_init(&priv->chan.lock);
+
+	err = snd_pcm_new(card, "N64 Audio", 0, 1, 0, &pcm);
+	if (err < 0)
+		goto fail_alloc;
+
+	pcm->private_data = priv;
+	strcpy(pcm->name, "N64 Audio");
+
+	snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &n64audio_pcm_ops);
+	snd_pcm_set_managed_buffer_all(pcm, SNDRV_DMA_TYPE_VMALLOC, NULL, 0, 0);
+
+	strcpy(card->driver, "N64 Audio");
+	strcpy(card->shortname, "N64 Audio");
+	strcpy(card->longname, "N64 Audio");
+
+	err = snd_card_register(card);
+	if (err < 0)
+		goto fail_alloc;
+
+	platform_set_drvdata(pdev, priv);
+
+	return 0;
+
+fail_alloc:
+	kfree(priv);
+
+fail_card:
+	snd_card_free(card);
+	return err;
+}
+
+static struct platform_driver n64audio_driver = {
+	.driver = {
+		.name = "n64audio",
+	},
+};
+
+static int __init n64audio_init(void)
+{
+	int ret;
+
+	ret = platform_driver_probe(&n64audio_driver, n64audio_probe);
+
+	return ret;
+}
+
+fs_initcall(n64audio_init);