C*********************** Contents **************************************** C* Sample driver program, VAX-11 Fortran; ********************************** C* HC: O(n^2) time, O(n^2) space hierarchical clustering, Fortran 77 ******* C* HCASS: determine cluster-memberships, Fortran 77. *********************** C* HCDEN: draw upper part of dendrogram, VAX-11 Fortran. ******************* C* Sample data set: last 36 lines. ***************************************** C*************************************************************************** C REAL DATA(18,16),CRIT(18),MEMBR(18) C REAL CRITVAL(9) C INTEGER IA(18),IB(18) C INTEGER ICLASS(18,9),HVALS(9) C INTEGER IORDER(9),HEIGHT(9) C DIMENSION NN(18),DISNN(18) C REAL D(153) C LOGICAL FLAG(18) C IN ABOVE, 18=N, 16=M, 9=LEV, 153=N(N-1)/2. C C C OPEN(UNIT=21,STATUS='OLD',FILE='SPECTR.DAT') C C C N = 18 C M = 16 C DO I=1,N C READ(21,100)(DATA(I,J),J=1,M) C ENDDO C 100 FORMAT(8F7.1) C C C LEN = (N*(N-1))/2 C IOPT=1 C CALL HC(N,M,LEN,IOPT,DATA,IA,IB,CRIT,MEMBR,NN,DISNN,FLAG,D) C C C LEV = 9 C CALL HCASS(N,IA,IB,CRIT,LEV,ICLASS,HVALS,IORDER,CRITVAL,HEIGHT) C C C CALL HCDEN(LEV,IORDER,HEIGHT,CRITVAL) C C C END C++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++C C C C HIERARCHICAL CLUSTERING using (user-specified) criterion. C C C C Parameters: C C C Cremoved DATA(N,M) input data matrix, C C DISS(LEN) dissimilarities in lower half diagonal C C storage; LEN = N.N-1/2, C C IOPT clustering criterion to be used, C C IA, IB, CRIT history of agglomerations; dimensions C C N, first N-1 locations only used, C C MEMBR, NN, DISNN vectors of length N, used to store C C cluster cardinalities, current nearest C C neighbour, and the dissimilarity assoc. C C with the latter. C C FLAG boolean indicator of agglomerable obj./ C C clusters. C C C C F. Murtagh, ESA/ESO/STECF, Garching, February 1986. C C C C------------------------------------------------------------C SUBROUTINE HC(N,M,LEN,IOPT,IA,IB,CRIT,MEMBR,NN,DISNN, X FLAG,DISS) REAL MEMBR(N) REAL DISS(LEN) INTEGER IA(N),IB(N) REAL CRIT(N) DIMENSION NN(N),DISNN(N) LOGICAL FLAG(N) REAL INF DATA INF/1.E+20/ C C Initializations C DO I=1,N MEMBR(I)=1. FLAG(I)=.TRUE. ENDDO NCL=N C C Construct dissimilarity matrix C DO I=1,N-1 DO J=I+1,N IND=IOFFSET(N,I,J) cinput DISS(IND)=0. cinput DO K=1,M cinput DISS(IND)=DISS(IND)+(DATA(I,K)-DATA(J,K))**2 cinput ENDDO IF (IOPT.EQ.1) DISS(IND)=DISS(IND)/2. C (Above is done for the case of the min. var. method C where merging criteria are defined in terms of variances C rather than distances.) ENDDO ENDDO C C Carry out an agglomeration - first create list of NNs C DO I=1,N-1 DMIN=INF DO J=I+1,N IND=IOFFSET(N,I,J) IF (DISS(IND).GE.DMIN) GOTO 500 DMIN=DISS(IND) JM=J 500 CONTINUE ENDDO NN(I)=JM DISNN(I)=DMIN ENDDO C 400 CONTINUE C Next, determine least diss. using list of NNs DMIN=INF DO I=1,N-1 IF (.NOT.FLAG(I)) GOTO 600 IF (DISNN(I).GE.DMIN) GOTO 600 DMIN=DISNN(I) IM=I JM=NN(I) 600 CONTINUE ENDDO NCL=NCL-1 C C This allows an agglomeration to be carried out. C I2=MIN0(IM,JM) J2=MAX0(IM,JM) IA(N-NCL)=I2 IB(N-NCL)=J2 CRIT(N-NCL)=DMIN C C Update dissimilarities from new cluster. C FLAG(J2)=.FALSE. DMIN=INF DO K=1,N IF (.NOT.FLAG(K)) GOTO 800 IF (K.EQ.I2) GOTO 800 X=MEMBR(I2)+MEMBR(J2)+MEMBR(K) IF (I2.LT.K) THEN IND1=IOFFSET(N,I2,K) ELSE IND1=IOFFSET(N,K,I2) ENDIF IF (J2.LT.K) THEN IND2=IOFFSET(N,J2,K) ELSE IND2=IOFFSET(N,K,J2) ENDIF IND3=IOFFSET(N,I2,J2) XX=DISS(IND3) C C WARD'S MINIMUM VARIANCE METHOD - IOPT=1. C IF (IOPT.EQ.1) THEN DISS(IND1)=(MEMBR(I2)+MEMBR(K))*DISS(IND1)+ X (MEMBR(J2)+MEMBR(K))*DISS(IND2)- X MEMBR(K)*XX DISS(IND1)=DISS(IND1)/X ENDIF C C SINGLE LINK METHOD - IOPT=2. C IF (IOPT.EQ.2) THEN DISS(IND1)=MIN(DISS(IND1),DISS(IND2)) ENDIF C C COMPLETE LINK METHOD - IOPT=3. C IF (IOPT.EQ.3) THEN DISS(IND1)=MAX(DISS(IND1),DISS(IND2)) ENDIF C C AVERAGE LINK (OR GROUP AVERAGE) METHOD - IOPT=4. C IF (IOPT.EQ.4) THEN DISS(IND1)=(MEMBR(I2)*DISS(IND1)+MEMBR(J2)*DISS(IND2))/ X (MEMBR(I2)+MEMBR(J2)) ENDIF C C MCQUITTY'S METHOD - IOPT=5. C IF (IOPT.EQ.5) THEN DISS(IND1)=0.5*DISS(IND1)+0.5*DISS(IND2) ENDIF C C MEDIAN (GOWER'S) METHOD - IOPT=6. C IF (IOPT.EQ.6) THEN DISS(IND1)=0.5*DISS(IND1)+0.5*DISS(IND2)-0.25*XX ENDIF C C CENTROID METHOD - IOPT=7. C IF (IOPT.EQ.7) THEN DISS(IND1)=(MEMBR(I2)*DISS(IND1)+MEMBR(J2)*DISS(IND2)- X MEMBR(I2)*MEMBR(J2)*XX/(MEMBR(I2)+MEMBR(J2)))/ X (MEMBR(I2)+MEMBR(J2)) ENDIF C IF (I2.GT.K) GOTO 800 IF (DISS(IND1).GE.DMIN) GOTO 800 DMIN=DISS(IND1) JJ=K 800 CONTINUE ENDDO MEMBR(I2)=MEMBR(I2)+MEMBR(J2) DISNN(I2)=DMIN NN(I2)=JJ C C Update list of NNs insofar as this is required. C DO I=1,N-1 IF (.NOT.FLAG(I)) GOTO 900 IF (NN(I).EQ.I2) GOTO 850 IF (NN(I).EQ.J2) GOTO 850 GOTO 900 850 CONTINUE C (Redetermine NN of I:) DMIN=INF DO J=I+1,N IND=IOFFSET(N,I,J) IF (.NOT.FLAG(J)) GOTO 870 IF (I.EQ.J) GOTO 870 IF (DISS(IND).GE.DMIN) GOTO 870 DMIN=DISS(IND) JJ=J 870 CONTINUE ENDDO NN(I)=JJ DISNN(I)=DMIN 900 CONTINUE ENDDO C C Repeat previous steps until N-1 agglomerations carried out. C IF (NCL.GT.1) GOTO 400 C C RETURN END C C FUNCTION IOFFSET(N,I,J) C Map row I and column J of upper half diagonal symmetric matrix C onto vector. IOFFSET=J+(I-1)*N-(I*(I+1))/2 RETURN END C+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++C C C C Given a HIERARCHIC CLUSTERING, described as a sequence of C C agglomerations, derive the assignments into clusters for the C C top LEV-1 levels of the hierarchy. C C Prepare also the required data for representing the C C dendrogram of this top part of the hierarchy. C C C C Parameters: C C C C IA, IB, CRIT: vectors of dimension N defining the agglomer- C C ations. C C LEV: number of clusters in largest partition. C C HVALS: vector of dim. LEV, used internally only. C C ICLASS: array of cluster assignments; dim. N by LEV. C C IORDER, CRITVAL, HEIGHT: vectors describing the dendrogram, C C all of dim. LEV. C C C C F. Murtagh, ESA/ESO/STECF, Garching, February 1986. C C C C HISTORY C C C C Bounds bug fix, Oct. 1990, F. Murtagh. C C Inserted line "IF (LOC.GT.LEV) GOTO 58" on line 48. This was C C occassioned by incorrect termination of this loop when I C C reached its (lower) extremity, i.e. N-LEV. Without the C C /CHECK=(BOUNDS) option on VAX/VMS compilation, this inserted C C statement was not necessary. C C---------------------------------------------------------------C SUBROUTINE HCASS(N,IA,IB,CRIT,LEV,ICLASS,HVALS,IORDER, X CRITVAL,HEIGHT) include 'sizesclu.dat' include 'COMMON.IOUNITS' integer ICLASS(maxconf,maxconf-1) INTEGER IA(N),IB(N),HVALS(LEV),IORDER(LEV), X HEIGHT(LEV) REAL CRIT(N),CRITVAL(LEV) C C Pick out the clusters which the N objects belong to, C at levels N-2, N-3, ... N-LEV+1 of the hierarchy. C The clusters are identified by the lowest seq. no. of C their members. C There are 2, 3, ... LEV clusters, respectively, for the C above levels of the hierarchy. C HVALS(1)=1 HVALS(2)=IB(N-1) LOC=3 DO 59 I=N-2,N-LEV,-1 DO 52 J=1,LOC-1 IF (IA(I).EQ.HVALS(J)) GOTO 54 52 CONTINUE HVALS(LOC)=IA(I) LOC=LOC+1 54 CONTINUE DO 56 J=1,LOC-1 IF (IB(I).EQ.HVALS(J)) GOTO 58 56 CONTINUE IF (LOC.GT.LEV) GOTO 58 HVALS(LOC)=IB(I) LOC=LOC+1 58 CONTINUE 59 CONTINUE C DO 400 LEVEL=N-LEV,N-2 DO 200 I=1,N ICL=I DO 100 ILEV=1,LEVEL 100 IF (IB(ILEV).EQ.ICL) ICL=IA(ILEV) NCL=N-LEVEL ICLASS(I,NCL-1)=ICL 200 CONTINUE 400 CONTINUE C DO 120 I=1,N DO 120 J=1,LEV-1 DO 110 K=2,LEV IF (ICLASS(I,J).NE.HVALS(K)) GOTO 110 ICLASS(I,J)=K GOTO 120 110 CONTINUE 120 CONTINUE C WRITE (iout,450) (j,j=2,LEV) 450 FORMAT(4X,' SEQ NOS',8(i2,'CL'),10000(i3,'CL')) WRITE (iout,470) (' ---',j=2,LEV) 470 FORMAT(4X,' -------',10000a4) DO 500 I=1,N WRITE (iout,600) I,(ICLASS(I,J),J=1,LEV-1) 600 FORMAT(I11,8I4,10000i5) 500 CONTINUE C C Determine an ordering of the LEV clusters (at level LEV-1) C for later representation of the dendrogram. C These are stored in IORDER. C Determine the associated ordering of the criterion values C for the vertical lines in the dendrogram. C The ordinal values of these criterion values may be used in C preference, and these are stored in HEIGHT. C Finally, note that the LEV clusters are renamed so that they C have seq. nos. 1 to LEV. C IORDER(1)=IA(N-1) IORDER(2)=IB(N-1) CRITVAL(1)=0.0 CRITVAL(2)=CRIT(N-1) HEIGHT(1)=LEV HEIGHT(2)=LEV-1 LOC=2 DO 700 I=N-2,N-LEV+1,-1 DO 650 J=1,LOC IF (IA(I).EQ.IORDER(J)) THEN C Shift rightwards and insert IB(I) beside IORDER(J): DO 630 K=LOC+1,J+1,-1 IORDER(K)=IORDER(K-1) CRITVAL(K)=CRITVAL(K-1) HEIGHT(K)=HEIGHT(K-1) 630 CONTINUE IORDER(J+1)=IB(I) CRITVAL(J+1)=CRIT(I) HEIGHT(J+1)=I-(N-LEV) LOC=LOC+1 ENDIF 650 CONTINUE 700 CONTINUE DO 705 I=1,LEV DO 703 J=1,LEV IF (HVALS(I).EQ.IORDER(J)) THEN IORDER(J)=I GOTO 705 ENDIF 703 CONTINUE 705 CONTINUE C RETURN END C+++++++++++++++++++++++++++++++++++++++++++++++++C C C C Construct a DENDROGRAM of the top 8 levels of C C a HIERARCHIC CLUSTERING. C C C C Parameters: C C C C IORDER, HEIGHT, CRITVAL: vectors of length LEV C C defining the dendrogram. C C These are: the ordering of objects C C along the bottom of the dendrogram C C (IORDER); the height of the vertical C C above each object, in ordinal values C C (HEIGHT); and in real values (CRITVAL).C C C C NOTE: these vectors MUST have been set up with C C LEV = 9 in the prior call to routine C C HCASS. C C C F. Murtagh, ESA/ESO/STECF, Garching, Feb. 1986.C C C C-------------------------------------------------C SUBROUTINE HCDEN(LEV,IORDER,HEIGHT,CRITVAL) include 'COMMON.IOUNITS' CHARACTER*80 LINE INTEGER IORDER(LEV),HEIGHT(LEV) REAL CRITVAL(LEV) INTEGER OUT(3*LEV,3*LEV) INTEGER UP,ACROSS,BLANK DATA UP,ACROSS,BLANK/'|','-',' '/ C C DO I=1,3*LEV DO J=1,3*LEV OUT(I,J)=BLANK ENDDO ENDDO C C DO I=3,3*LEV,3 I2=I/3 C J2=3*LEV+1-3*HEIGHT(I2) DO J=3*LEV,J2,-1 OUT(J,I)=UP ENDDO C DO K=I,3,-1 I3=INT((K+2)/3) IF ( (3*LEV+1-HEIGHT(I3)*3).LT.J2) GOTO 100 OUT(J2,K)=ACROSS ENDDO 100 CONTINUE C ENDDO C C IC=3 DO I=1,3*LEV IF (I.EQ.IC+1) THEN IDUM=IC/3 IDUM=LEV-IDUM DO L=1,LEV IF (HEIGHT(L).EQ.IDUM) GOTO 190 ENDDO 190 IDUM=L WRITE(iout,200) CRITVAL(IDUM),(OUT(I,J),J=1,3*LEV) IC=IC+3 ELSE LINE = ' ' WRITE(iout,210) (OUT(I,J),J=1,3*LEV) ENDIF 200 FORMAT(1H ,8X,F12.2,4X,27000A1) 210 FORMAT(1H ,24X,27000A1) ENDDO WRITE(iout,250) WRITE(iout,220)(IORDER(J),J=1,LEV) WRITE(iout,250) 220 FORMAT(1H ,24X,9000I3) WRITE(iout,230) LEV 230 FORMAT(1H ,13X,'CRITERION CLUSTERS 1 TO ',i3) WRITE(iout,240) LEV-1 240 FORMAT(1H ,13X,'VALUES. (TOP ',i3,' LEVELS OF HIERARCHY).') 250 FORMAT(/) C C RETURN END