/* RLS.ASM      performs the Recursive Least-Squares (RLS) algorithm 
 implemented with a transversal FIR filter structure

	*********************************************************************
 	* 1) x= s*Z(n-1).u  , where s= 1/forgetting factor, n= time index   *
	*        u= [u(n) u(n-1) ... u(n-N+1)]= input samples in delay line *
	*        Z is an N-by-N matrix, N= number of filter weights         *
	* 2) k= x/[1+u.x] where k is an N-by-1 vector                       *
	* 3) Z(n)= s*Z(n-1) - k.xT , xT is the transpose of vector x        *
	* 4) e(n)= d(n) - w(N-1).u , e(n)= filter "a priori" error signal   *
	*        w= [w0 w1 ... wN-1]= filter weights, d(n)= desired output  *
	* 5) w(n)= w(n-1) + k*e(n)                                          *
	*********************************************************************

     Written by: Wassim G. Najm, Analog Devices, DSP division, April 18 1991

     Calling parameters (inputs):
     f0= u(n)= input sample
     f9= d(n)= desired output

     Altered registers:
     f0, f1, f2, f3, f4, f5, f8, f9, f10, f11, f12, f13, f14

     Computation cycles:
     rls_alg: 3N**2+9N+20 per iteration, rls_init: N**2+3N+25

     Results (outputs):
     f13= y(n)= filter output
     f1= e(n)= filter "a priori" error signal
     i0 -> Data Memory Data buffer of the filter weights

     Memory usage:
     pm code= 81 words, pm data= 2N words, dm data= N**2+2N   
*/

#define	TAPS		5
#define FORGET_FACT	1.0
#define INIT_FACT	10.0

#include "macros.h"

.GLOBAL	rls_init, rls_alg;

.SEGMENT/DM	seg_dmda;
.VAR	weights[TAPS];
.VAR	zmatrix[TAPS*TAPS];
.VAR	xvector[TAPS];
.ENDSEG;

.SEGMENT/PM	seg_pmda;
.VAR	deline_data[TAPS];
.VAR	kvector[TAPS];
.ENDSEG;

.SEGMENT/PM	seg_pmco;
rls_init:	b0=weights;
		l0=TAPS;
		b1=zmatrix;
		l1=TAPS*TAPS;
		b3=b1;
		l3=l1;
		b2=xvector;
		l2=TAPS;
		m0=1;
		m2=0;
		m3=-3;
		b8=deline_data;
		l8=TAPS;
		b9=kvector;
		l9=TAPS;
		m8=-1;
		m9=1;
		m11=3;
		f10=2.0;
		f14=1.0;
		f5=1/FORGET_FACT;
		f0=0.0;
		f1=INIT_FACT;
		lcntr=TAPS, do clear_bufs until lce;
		  dm(i0,m0)=f0, pm(i8,m9)=f0;
clear_bufs:	  dm(i2,m0)=f0, pm(i9,m9)=f0;
		dm(i1,m0)=f1;
		lcntr=TAPS-1, do init_zmatrix until lce;
		  lcntr=TAPS, do clear_zel until lce;
clear_zel:          dm(i1,m0)=f0;
init_zmatrix:	  dm(i1,m0)=f1;
		rts;
rls_init.end:

rls_alg:	f4=dm(i0,m0), pm(i8,m8)=f0;
				//f4=w0(n-1), store u(n)
		f8=f0*f4, f4=dm(i0,m0), f0=pm(i8,m8);
				//f8=u(n)*w0(n-1), f4=w1(n-1), f0=u(n-1)
		f12=f0*f4, f4=dm(i0,m0), f0=pm(i8,m8);
				//f12=u(n-1)*w1(n-1), f4=w2(n-1), f0=u(n-2)
		lcntr=TAPS-3, do mac1 until lce;
mac1:		  f12=f0*f4, f8=f8+f12, f4=dm(i0,m0), f0=pm(i8,m8);
				//f12=u(n-i)*wi(n-1), f8=sum of prod, 
				//f4=wi+1(n-1), f0=u(n-i-1)
		f12=f0*f4, f8=f8+f12, f4=dm(i1,m0), f0=pm(i8,m8);
				//f12=u(n-N+1)*wN-1(n-1), f4=Z0(0,n-1), f0=u(n)
		f8=f0*f4, f13=f8+f12, f4=dm(i1,m0), f0=pm(i8,m8);
				//f13=y(n), f8= u(n)*Z0(0,n-1), f4=Z0(1,n-1), f0=u(n-1)
		f12=f0*f4, f1=f9-f13, f4=dm(i1,m0), f0=pm(i8,m8);
				//f1=e(n), f12=u(n-1)*Z0(1,n-1), f4=Z0(2,n-1),f0=u(n-2)

		lcntr=TAPS, do compute_xn until lce;
		  lcntr=TAPS-3, do mac2 until lce;
mac2:		    f12=f0*f4, f8=f8+f12, f4=dm(i1,m0), f0=pm(i8,m8);
				//f12=u(n-i)*Zk(i,n-1), f8=sum of prod,
				//f4=Zk(i+1,n-1), f0=u(n-i-1)
		  f12=f0*f4, f8=f8+f12, f4=dm(i1,m0), f0=pm(i8,m8);
				//f12=u(n-N+1)*Zk(N-1,n-1), f4=Zk+1(0,n-1), f0=u(n)
		  f8=f0*f4, f2=f8+f12, f4=dm(i1,m0), f0=pm(i8,m8);
				//f2=xk(n),f8=u(n)*Zk+1(0,n-1),f4=Zk+1(1,n-1),f0=u(n-1)
		  f12=f0*f4, f4=dm(i1,m0), f0=pm(i8,m8);
				//f12=u(n-1)*Zk+1(1,n-1), f4=Zk+1(2,n-1), f0=u(n-2)
compute_xn:	  dm(i2,m0)=f2; 	//store xk(n)
		f4=dm(i1,m3), f0=pm(i8,m11); //i1 -> Z0(0,n-1), i8 -> u(n)
		
		f4=dm(i2,m0), f0=pm(i8,m8); //f4= x0(n), f0= u(n)
		f8=f0*f4, f4=dm(i2,m0), f0=pm(i8,m8);
				//f8=x0(n)*u(n), f4=x1(n), f0=u(n-1)
		f12=f0*f4, f8=f8+f14, f4=dm(i2,m0), f0=pm(i8,m8);
				//f12=x1(n)*u(n-1), f8=1+x0(n)*u(n), f4=x2(n),f0=u(n-2)
		lcntr=TAPS-3, do mac3 until lce;
mac3:		  f12=f0*f4, f8=f8+f12, f4=dm(i2,m0), f0=pm(i8,m8);
				//f12=xi(n)*u(n-i), f8=1+sum of prod, f4=xi+1(n),
				//f0=u(n-i-1)
		f12=f0*f4, f8=f8+f12, f4=f14; 	//f12=u(n-N+1)*xN-1(n), f4=1.
		f12=f8+f12, f3=dm(i2,m0); 		//f12=1+u.x, f3= x0(n)
		DIVIDE(f2,f4,f12,f10,f0); 		//f2=1/(1+u.x)
		f0=f2*f3, modify(i8,m9); 		//f0=k0(n), 
					  	//i8 -> u(n+1) location in delay line

		lcntr=TAPS-1, do comp_kn_wn until lce;
		  f8=f0*f1, f12=dm(i0,m2), pm(i9,m9)=f0;
				//f8= ki(n)*e(n), f12=wi(n-1), store ki(n)
		  f8=f8+f12, f3=dm(i2,m0); 		//f8=wi(n), f3=xi+1(n)
comp_kn_wn:	  f0=f2*f3, dm(i0,m0)=f8; 	//f0=ki+1(n), store wi(n)
		f8=f0*f1, f12=dm(i0,m2), pm(i9,m9)=f0;
				//f8=kN-1(n)*e(n), f12=wN-1(n-1), store kN-1(n)
		f11=f8+f12, f2=dm(i2,m0), f4=pm(i9,m9);
				//f11= wN-1(n), f2= x0(n), f4= k0(n)

		f12=f2*f4, f8=dm(i1,m0), f4=pm(i9,m9);
				//f12= x0(n)*k0(n), f8=Z0(0,n-1), f4= k1(n)
		lcntr=TAPS-1, do update_zn until lce;
		  f12=f2*f4, f0=f8-f12, f8=dm(i1,m0);
		  lcntr=TAPS-2, do update_zrow until lce;
		    f3=f0*f5, f0=f8-f12, f8=dm(i1,m0), f4=pm(i9,m9);
update_zrow:	    f12=f2*f4, dm(i3,m0)=f3;
		  f3=f0*f5, f0=f8-f12, f2=dm(i2,m0);
		  f0=f0*f5, dm(i3,m0)=f3, f4=pm(i9,m9);
		  f12=f2*f4, dm(i3,m0)=f0, f4=pm(i9,m9);
update_zn:	  f8=dm(i1,m0);
		f12=f2*f4, f0=f8-f12, f8=dm(i1,m0);
		lcntr=TAPS-2, do update_zlastrow until lce;
		  f3=f0*f5, f0=f8-f12, f8=dm(i1,m0), f4=pm(i9,m9);
update_zlastrow:  f12=f2*f4, dm(i3,m0)=f3;
		f3=f0*f5, f0=f8-f12, dm(i0,m0)=f11;
		rts(db);
		f0=f0*f5, dm(i3,m0)=f3;
		dm(i3,m0)=f0;

rls_alg.end:		
.ENDSEG;
 
