1 C*********************** Contents ****************************************
2 C* Sample driver program, VAX-11 Fortran; **********************************
3 C* HC: O(n^2) time, O(n^2) space hierarchical clustering, Fortran 77 *******
4 C* HCASS: determine cluster-memberships, Fortran 77. ***********************
5 C* HCDEN: draw upper part of dendrogram, VAX-11 Fortran. *******************
6 C* Sample data set: last 36 lines. *****************************************
7 C***************************************************************************
8 C REAL DATA(18,16),CRIT(18),MEMBR(18)
10 C INTEGER IA(18),IB(18)
11 C INTEGER ICLASS(18,9),HVALS(9)
12 C INTEGER IORDER(9),HEIGHT(9)
13 C DIMENSION NN(18),DISNN(18)
16 C IN ABOVE, 18=N, 16=M, 9=LEV, 153=N(N-1)/2.
19 C OPEN(UNIT=21,STATUS='OLD',FILE='SPECTR.DAT')
25 C READ(21,100)(DATA(I,J),J=1,M)
32 C CALL HC(N,M,LEN,IOPT,DATA,IA,IB,CRIT,MEMBR,NN,DISNN,FLAG,D)
36 C CALL HCASS(N,IA,IB,CRIT,LEV,ICLASS,HVALS,IORDER,CRITVAL,HEIGHT)
39 C CALL HCDEN(LEV,IORDER,HEIGHT,CRITVAL)
43 C++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++C
45 C HIERARCHICAL CLUSTERING using (user-specified) criterion. C
49 Cremoved DATA(N,M) input data matrix, C
50 C DISS(LEN) dissimilarities in lower half diagonal C
51 C storage; LEN = N.N-1/2, C
52 C IOPT clustering criterion to be used, C
53 C IA, IB, CRIT history of agglomerations; dimensions C
54 C N, first N-1 locations only used, C
55 C MEMBR, NN, DISNN vectors of length N, used to store C
56 C cluster cardinalities, current nearest C
57 C neighbour, and the dissimilarity assoc. C
59 C FLAG boolean indicator of agglomerable obj./ C
62 C F. Murtagh, ESA/ESO/STECF, Garching, February 1986. C
64 C------------------------------------------------------------C
65 SUBROUTINE HC(N,M,LEN,IOPT,IA,IB,CRIT,MEMBR,NN,DISNN,
71 DIMENSION NN(N),DISNN(N)
84 C Construct dissimilarity matrix
91 cinput DISS(IND)=DISS(IND)+(DATA(I,K)-DATA(J,K))**2
93 IF (IOPT.EQ.1) DISS(IND)=DISS(IND)/2.
94 C (Above is done for the case of the min. var. method
95 C where merging criteria are defined in terms of variances
96 C rather than distances.)
100 C Carry out an agglomeration - first create list of NNs
106 IF (DISS(IND).GE.DMIN) GOTO 500
116 C Next, determine least diss. using list of NNs
119 IF (.NOT.FLAG(I)) GOTO 600
120 IF (DISNN(I).GE.DMIN) GOTO 600
128 C This allows an agglomeration to be carried out.
136 C Update dissimilarities from new cluster.
141 IF (.NOT.FLAG(K)) GOTO 800
142 IF (K.EQ.I2) GOTO 800
143 X=MEMBR(I2)+MEMBR(J2)+MEMBR(K)
154 IND3=IOFFSET(N,I2,J2)
157 C WARD'S MINIMUM VARIANCE METHOD - IOPT=1.
160 DISS(IND1)=(MEMBR(I2)+MEMBR(K))*DISS(IND1)+
161 X (MEMBR(J2)+MEMBR(K))*DISS(IND2)-
163 DISS(IND1)=DISS(IND1)/X
166 C SINGLE LINK METHOD - IOPT=2.
169 DISS(IND1)=MIN(DISS(IND1),DISS(IND2))
172 C COMPLETE LINK METHOD - IOPT=3.
175 DISS(IND1)=MAX(DISS(IND1),DISS(IND2))
178 C AVERAGE LINK (OR GROUP AVERAGE) METHOD - IOPT=4.
181 DISS(IND1)=(MEMBR(I2)*DISS(IND1)+MEMBR(J2)*DISS(IND2))/
182 X (MEMBR(I2)+MEMBR(J2))
185 C MCQUITTY'S METHOD - IOPT=5.
188 DISS(IND1)=0.5*DISS(IND1)+0.5*DISS(IND2)
191 C MEDIAN (GOWER'S) METHOD - IOPT=6.
194 DISS(IND1)=0.5*DISS(IND1)+0.5*DISS(IND2)-0.25*XX
197 C CENTROID METHOD - IOPT=7.
200 DISS(IND1)=(MEMBR(I2)*DISS(IND1)+MEMBR(J2)*DISS(IND2)-
201 X MEMBR(I2)*MEMBR(J2)*XX/(MEMBR(I2)+MEMBR(J2)))/
202 X (MEMBR(I2)+MEMBR(J2))
205 IF (I2.GT.K) GOTO 800
206 IF (DISS(IND1).GE.DMIN) GOTO 800
211 MEMBR(I2)=MEMBR(I2)+MEMBR(J2)
215 C Update list of NNs insofar as this is required.
218 IF (.NOT.FLAG(I)) GOTO 900
219 IF (NN(I).EQ.I2) GOTO 850
220 IF (NN(I).EQ.J2) GOTO 850
223 C (Redetermine NN of I:)
227 IF (.NOT.FLAG(J)) GOTO 870
229 IF (DISS(IND).GE.DMIN) GOTO 870
239 C Repeat previous steps until N-1 agglomerations carried out.
241 IF (NCL.GT.1) GOTO 400
248 FUNCTION IOFFSET(N,I,J)
249 C Map row I and column J of upper half diagonal symmetric matrix
251 IOFFSET=J+(I-1)*N-(I*(I+1))/2
254 C+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++C
256 C Given a HIERARCHIC CLUSTERING, described as a sequence of C
257 C agglomerations, derive the assignments into clusters for the C
258 C top LEV-1 levels of the hierarchy. C
259 C Prepare also the required data for representing the C
260 C dendrogram of this top part of the hierarchy. C
264 C IA, IB, CRIT: vectors of dimension N defining the agglomer- C
266 C LEV: number of clusters in largest partition. C
267 C HVALS: vector of dim. LEV, used internally only. C
268 C ICLASS: array of cluster assignments; dim. N by LEV. C
269 C IORDER, CRITVAL, HEIGHT: vectors describing the dendrogram, C
272 C F. Murtagh, ESA/ESO/STECF, Garching, February 1986. C
276 C Bounds bug fix, Oct. 1990, F. Murtagh. C
277 C Inserted line "IF (LOC.GT.LEV) GOTO 58" on line 48. This was C
278 C occassioned by incorrect termination of this loop when I C
279 C reached its (lower) extremity, i.e. N-LEV. Without the C
280 C /CHECK=(BOUNDS) option on VAX/VMS compilation, this inserted C
281 C statement was not necessary. C
282 C---------------------------------------------------------------C
283 SUBROUTINE HCASS(N,IA,IB,CRIT,LEV,ICLASS,HVALS,IORDER,
285 include 'sizesclu.dat'
286 include 'COMMON.IOUNITS'
287 integer ICLASS(maxconf,maxconf-1)
288 INTEGER IA(N),IB(N),HVALS(LEV),IORDER(LEV),
290 REAL CRIT(N),CRITVAL(LEV)
292 C Pick out the clusters which the N objects belong to,
293 C at levels N-2, N-3, ... N-LEV+1 of the hierarchy.
294 C The clusters are identified by the lowest seq. no. of
296 C There are 2, 3, ... LEV clusters, respectively, for the
297 C above levels of the hierarchy.
304 IF (IA(I).EQ.HVALS(J)) GOTO 54
310 IF (IB(I).EQ.HVALS(J)) GOTO 58
312 IF (LOC.GT.LEV) GOTO 58
318 DO 400 LEVEL=N-LEV,N-2
322 100 IF (IB(ILEV).EQ.ICL) ICL=IA(ILEV)
331 IF (ICLASS(I,J).NE.HVALS(K)) GOTO 110
337 WRITE (iout,450) (j,j=2,LEV)
338 450 FORMAT(4X,' SEQ NOS',8(i2,'CL'),10000(i3,'CL'))
339 WRITE (iout,470) (' ---',j=2,LEV)
340 470 FORMAT(4X,' -------',10000a4)
342 WRITE (iout,600) I,(ICLASS(I,J),J=1,LEV-1)
343 600 FORMAT(I11,8I4,10000i5)
346 C Determine an ordering of the LEV clusters (at level LEV-1)
347 C for later representation of the dendrogram.
348 C These are stored in IORDER.
349 C Determine the associated ordering of the criterion values
350 C for the vertical lines in the dendrogram.
351 C The ordinal values of these criterion values may be used in
352 C preference, and these are stored in HEIGHT.
353 C Finally, note that the LEV clusters are renamed so that they
354 C have seq. nos. 1 to LEV.
363 DO 700 I=N-2,N-LEV+1,-1
365 IF (IA(I).EQ.IORDER(J)) THEN
366 C Shift rightwards and insert IB(I) beside IORDER(J):
367 DO 630 K=LOC+1,J+1,-1
368 IORDER(K)=IORDER(K-1)
369 CRITVAL(K)=CRITVAL(K-1)
370 HEIGHT(K)=HEIGHT(K-1)
374 HEIGHT(J+1)=I-(N-LEV)
381 IF (HVALS(I).EQ.IORDER(J)) THEN
390 C+++++++++++++++++++++++++++++++++++++++++++++++++C
392 C Construct a DENDROGRAM of the top 8 levels of C
393 C a HIERARCHIC CLUSTERING. C
397 C IORDER, HEIGHT, CRITVAL: vectors of length LEV C
398 C defining the dendrogram. C
399 C These are: the ordering of objects C
400 C along the bottom of the dendrogram C
401 C (IORDER); the height of the vertical C
402 C above each object, in ordinal values C
403 C (HEIGHT); and in real values (CRITVAL).C
405 C NOTE: these vectors MUST have been set up with C
406 C LEV = 9 in the prior call to routine C
409 C F. Murtagh, ESA/ESO/STECF, Garching, Feb. 1986.C
411 C-------------------------------------------------C
412 SUBROUTINE HCDEN(LEV,IORDER,HEIGHT,CRITVAL)
413 include 'COMMON.IOUNITS'
415 INTEGER IORDER(LEV),HEIGHT(LEV)
417 INTEGER OUT(3*LEV,3*LEV)
418 INTEGER UP,ACROSS,BLANK
419 DATA UP,ACROSS,BLANK/'|','-',' '/
432 J2=3*LEV+1-3*HEIGHT(I2)
439 IF ( (3*LEV+1-HEIGHT(I3)*3).LT.J2) GOTO 100
453 IF (HEIGHT(L).EQ.IDUM) GOTO 190
456 WRITE(iout,200) CRITVAL(IDUM),(OUT(I,J),J=1,3*LEV)
460 WRITE(iout,210) (OUT(I,J),J=1,3*LEV)
462 200 FORMAT(1H ,8X,F12.2,4X,27000A1)
463 210 FORMAT(1H ,24X,27000A1)
466 WRITE(iout,220)(IORDER(J),J=1,LEV)
468 220 FORMAT(1H ,24X,9000I3)
470 230 FORMAT(1H ,13X,'CRITERION CLUSTERS 1 TO ',i3)
471 WRITE(iout,240) LEV-1
472 240 FORMAT(1H ,13X,'VALUES. (TOP ',i3,' LEVELS OF HIERARCHY).')