aboutsummaryrefslogtreecommitdiff
path: root/about-architecture.html
blob: fee6e5fe5beb47de48050d545118e462040e7c29 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
<!DOCTYPE html>
<!-- Copyright notes are just below the head and before body -->

    <html lang="en-US">

        <!-- HTML Header -->
        <head>
            <!-- Title of the page. -->
            <title>Maneage -- Managing data lineage</title>

            <!-- Enable UTF-8 encoding to easily use non-ASCII characters -->
            <meta charset="UTF-8">
            <meta http-equiv="Content-type" content="text/html; charset=UTF-8">

            <!-- Put logo beside the address bar -->
            <link rel="shortcut icon" href="./img/favicon.svg" />

            <!-- The viewport meta tag is placed mainly for mobile browsers
                that are pre-configured in different ways (for example setting
                a different width for the page than the actual width of the
                device, or zooming to different values). Without this the CSS
                media solutions might not work properly on all mobile browsers. -->
                <meta name="viewport"
                      content="width=device-width, initial-scale=1">

                <!-- Basic styles -->
                <link rel="stylesheet" href="css/base.css" />
        </head>

        <!--
            Webpage of Maneage: a framework for managing data lineage

            Copyright (C) 2020-2022 Pedram Ashofteh Ardakani <pedramardakani@pm.me>
            Copyright (C) 2020-2022 Mohammad Akhlaghi <mohammad@akhlaghi.org>

            This file is part of Maneage. Maneage is free software: you can
            redistribute it and/or modify it under the terms of the GNU General
            Public License as published by the Free Software Foundation, either
            version 3 of the License, or (at your option) any later version.

            Maneage is distributed in the hope that it will be useful, but
            WITHOUT ANY WARRANTY; without even the implied warranty of
            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
            General Public License for more details. See
            <http://www.gnu.org/licenses/>.  -->

        <!-- Start the main body. -->
        <body>
            <div id="container">
                <header role="banner">
                    <!-- global navigation -->
                    <nav role="navigation" id="nav-hamburger-wrapper">
                        <input type="checkbox" id="nav-hamburger-input"/>
                        <label for="nav-hamburger-input">|||</label>
                        <div id="nav-hamburger-items" class="button">
                            <a href="index.html">Home</a>
                            <a href="about.html">About</a>
                            <a href="http://git.maneage.org/project.git/">Git</a>
                            <a href="tutorial.html">Tutorial</a>
                        </div>
                    </nav>
                </header>
                <div class="banner">
                    <div>
                        <a href="index.html"><img src="img/maneage-logo.svg" alt="Maneage home" /></a>
                    </div>
                    <div>
                        <h1>Maneage</h1><h2>Managing Data Lineage</h2>
                        <p>Copyright &copy; 2018-2022 Mohammad Akhlaghi <a href="&#109;&#x61;&#x69;&#x6C;&#x74;&#x6F;:&#x6D;&#111;&#104;&#97;&#x6D;&#109;a&#x64;&#64;&#x61;&#107;&#x68;&#x6C;&#x61;&#x67;&#104;&#x69;.&#x6F;&#x72;&#103;">&#x6D;&#111;&#104;&#97;&#x6D;&#109;a&#x64;&#64;&#x61;&#107;&#x68;&#x6C;&#x61;&#x67;&#104;&#x69;.&#x6F;&#x72;&#103;</a><br />
                        Copyright &copy; 2020-2022 Raul Infante-Sainz <a href="m&#x61;&#105;&#108;t&#111;:&#x69;&#x6E;&#x66;&#x61;&#x6E;&#116;&#101;&#115;&#97;&#x69;n&#122;&#64;&#103;&#x6D;&#x61;&#x69;&#x6C;&#x2E;&#x63;&#111;&#x6D;">&#x69;&#x6E;&#x66;&#x61;&#x6E;&#116;&#101;&#115;&#97;&#x69;n&#122;&#64;&#103;&#x6D;&#x61;&#x69;&#x6C;&#x2E;&#x63;&#111;&#x6D;</a><br />
                        <a href="#page-footer">License Conditions</a></p>
                    </div>
                </div>





		<hr />
		<p align="right">Next: <a href="about-customize.html">Customization checklist</a>, Previous: <a href="about-make.html">Why Make?</a>, Up: <a href="about.html">About</a> </p>

                <h2>Project architecture</h2>

                <p>In order to customize Maneage to your research, it
                is important to first understand its architecture so
                you can navigate your way in the directories and
                understand how to implement your research project
                within its framework: where to add new files and which
                existing files to modify for what purpose. </p>

                <p>The project has two top-level directories: <code>reproduce</code> and
                <code>tex</code>. <code>reproduce</code> hosts all the
                software building and analysis steps. <code>tex</code>
                contains all the final paper's components to be
                compiled into a PDF using LaTeX. The image below shows
                the directory and file structure in a hypothetical
                project using Maneage. Files are shown with small,
                green boxes that have a suffix in their names (for
                example <code>format.mk</code> or
                <code>download.tex</code>).  Directories (containing
                multiple files) are shown as large, brown boxes, where
                the name ends in a slash (<code>/</code>).
                Directories with dashed lines and no files (just a
                description) are symbolic links that are created after
                building the project, pointing to commonly needed
                built directories.  Symbolic links and their contents
                are not considered part of the source and are not
                under version control.  Files and directories are
                shown within their parent directory.  For example the
                full address of <code>format.mk</code> from the top
                project directory is
                <code>reproduce/analysis/make/format.mk</code>.</p>

		<img class="center" src="img/maneage-file-structure.png" alt="Directory and file structure of a hypothetical project using Maneage" width="90%" />
		<p>&nbsp;</p>

                <p>As shown above, the <code>reproduce</code>
                directory has two
                sub-directories: <code>software</code> and
                <code>analysis</code>. As the name says, the former
                contains all the instructions to download, build and
                install (independent of the host operating system) the
                necessary software (these are called by
                the <code>./project configure</code> command). The
                latter contains instructions on how to use that
                software to do your project's analysis.</p>

                <p>After it finishes, <code>./project configure</code>
                will create the following symbolic links in the
                project's top source directory: <code>.build</code>
                which points to the top build directory
                and <code>.local</code> for easy access to the custom
                built software installation directory. With these you
                can easily access the build directory and
                project-specific software from your top source
                directory. For example if you
                run <code>.local/bin/ls</code> you will be using
                the <code>ls</code> of Maneage, which is probably
                different from your system's <code>ls</code> (run them
                both with
                <code>--version</code> to check).</p>

                <p>Once the project is configured for your
                system, <code>./project make</code> will do the basic
                preparations and run the project's analysis with the
                custom version of software. The <code>project</code>
                script is just a wrapper, and with the
                <code>make</code> argument, it will first call <code>top-prepare.mk</code> and then <code>top-make.mk</code>
                (both are in the <code>reproduce/analysis/make</code> directory).</p>

                <p>In terms of
                organization, <code>top-prepare.mk</code>
                and <code>top-make.mk</code> have a nearly identical design,
                with only minor differences. So, let's continue Maneage's
                architecture with <code>top-make.mk</code>. Once you
                understand that, you'll clearly
                understand <code>top-prepare.mk</code> also. These
                very high-level files are relatively short and heavily
                commented so hopefully the descriptions in each
                comment will be enough to understand the general
                details. As you read this section, please also look at
                the contents of the mentioned files and directories to
                fully understand what is going on.</p>

                <p>Before starting to look into
                <code>top-make.mk</code>, it is important to
                recall that Make defines dependencies by
                files. Therefore, the input/prerequisite and output of
                every step/rule must be a file. Also recall that Make
                will use the modification date of the prerequisite(s)
                and target files to see if the target must be re-built
                or not. Therefore during the processing, <em>many</em>
                intermediate files will be created (see the tips
                section below on a good strategy to deal with
                large/huge files).</p>

                <p>To keep the source and (intermediate) built files
                separate, the user <em>must</em> define a top-level
                build directory variable (or <code>$(BDIR)</code>) to
                host all the intermediate files (you defined it
                during <code>./project configure</code>). This
                directory doesn't need to be version controlled or
                even synchronized, or backed-up in other servers: its
                contents are all products, and can be easily
                re-created any time. As you define targets for your
                new rules, it is thus important to place them all
                under sub-directories of <code>$(BDIR)</code>. As
                mentioned above, you always have fast access to this
                "build"-directory with the <code>.build</code>
                symbolic link. Also, beware to <em>never</em> make any
                manual change in the files of the build-directory,
                just delete them (so they are re-built).</p>

                <p>In this architecture, we have two types of
                Makefiles that are loaded into the
                top <code>Makefile</code>: <em>configuration-Makefiles</em>
                (only independent variables/configurations)
                and <em>workhorse-Makefiles</em> (Makefiles that
                actually contain analysis/processing rules).</p>

                <p>The configuration-Makefiles are those that satisfy
                these two wildcards:
                <code>reproduce/software/config/*.conf</code> (for
                building the necessary software when you
                run <code>./project configure</code>)
                and <code>reproduce/analysis/config/*.conf</code> (for
                the high-level analysis, when you run <code>./project
                make</code>). These Makefiles don't actually have any
                rules, they just have values for various free
                parameters throughout the configuration or
                analysis. Open a few of them to see for
                yourself. These Makefiles must only contain raw Make
                variables (project configurations). By "raw" we mean
                that the Make variables in these files must not depend
                on variables in any other configuration-Makefile. This
                is because we don't want to assume any order in
                reading them. It is also very important
                to <em>not</em> define any rule, or other Make
                construct, in these configuration-Makefiles.</p>

                <p>Following this rule-of-thumb enables you to set
                these configuration-Makefiles as a prerequisite to any
                target that depends on their variable
                values. Therefore, if you change any of their values,
                all targets that depend on those values will be
                re-built. This is very convenient as your project
                scales up and gets more complex.</p>

                <p>The workhorse-Makefiles are those satisfying these wildcards:
                <code>reproduce/software/make/*.mk</code>
                and <code>reproduce/analysis/make/*.mk</code>. They
                contain the details of the processing steps (Makefiles
                containing rules). Therefore, in this phase <em>order
                is important</em>, because the prerequisites of most
                rules will be the targets of other rules that will be
                defined prior to them (not a fixed name
                like <code>paper.pdf</code>). The lower-level rules
                must be imported into Make before the higher-level
                ones.</p>

                <p>All processing steps are assumed to ultimately
                (usually after many rules) end up in some number,
                image, figure, or table that will be included in the
                paper. The writing of these results into the final
                report/paper is managed through separate LaTeX files
                that only contain macros (a name given to a
                number/string to be used in the LaTeX source, which
                will be replaced when compiling it to the final
                PDF). So the last target in a workhorse-Makefile is
                a <code>.tex</code> file (with the same base-name as
                the Makefile, but
                in <code>$(BDIR)/tex/macros</code>). As a result, if
                the targets in a workhorse-Makefile aren't directly a
                prerequisite of other workhorse-Makefile targets, they
                can be a prerequisite of that intermediate LaTeX macro
                file and thus be called when necessary. Otherwise,
                they will be ignored by Make.</p>

                <p>Maneage also has a mode to share the build
                directory between several users of a Unix group (when
                working on large computer clusters). In this scenario,
                each user can have their own cloned project source,
                but share the large built files between each other. To
                do this, it is necessary for all built files to give
                full permission to group members while not allowing
                any other users access to the contents. Therefore
                the <code>./project configure</code> and
                <code>./project make</code> steps must be called with
                special conditions which are managed in
                the <code>--group</code> option.</p>

                <p>Let's see how this design is implemented. Please
                open and inspect
                <code>top-make.mk</code> as we go along here. The
                first step (un-commented line) is to import the local
                configuration (your answers to the questions of
                <code>./project configure</code>). They are defined in
                the configuration-Makefile
                <code>reproduce/software/config/LOCAL.conf</code>
                    which was also built by <code>./project
                    configure</code> (based on
                    the <code>LOCAL.conf.in</code> template of the
                    same directory).</p>

                <p>The next non-commented set of the
                top <code>Makefile</code> defines the ultimate target
                of the whole project (<code>paper.pdf</code>). But to
                avoid mistakes, a sanity check is necessary to see if
                Make is being run with the same group settings as the
                configure script (for example when the project is
                configured for group access using
                the <code>./for-group</code> script, but Make
                isn't). Therefore we use a Make conditional to define
                the <code>all</code> target based on the group
                permissions.</p>

                <p>Having defined the top/ultimate target, our next
                step is to include all the other necessary
                Makefiles. However, order matters in the importing of
                workhorse-Makefiles and each must also have a TeX
                macro file with the same base name (without a
                suffix). Therefore, the next step in the top-level
                Makefile is to define the <code>makesrc</code>
                variable to keep the base names (without
                a <code>.mk</code> suffix) of the workhorse-Makefiles
                that must be imported, in the proper order.</p>

                <p>Finally, we import all the necessary remaining
                Makefiles: 1) All the analysis configuration-Makefiles
                with a wildcard. 2) The software
                configuration-Makefile that contains their version
                (just in case it's necessary). 3) All
                workhorse-Makefiles in the proper order using a Make
                <code>foreach</code> loop.</p>

                <p>In short, to keep things modular, readable and
                manageable, follow these recommendations: 1) Set
                clear-to-understand names for the
                configuration-Makefiles, and workhorse-Makefiles, 2)
                Only import other Makefiles from the top Makefile. These
                will let you know/remember generally which step you
                are taking before or after another. Projects will
                scale up very fast. Thus if you don't start and
                continue with a clean and robust convention like this,
                in the end it will become very dirty and hard to
                manage/understand (even for yourself). As a general
                rule of thumb, break your rules into as many
                logically-similar but independent steps as
                possible.</p>

                <p>The <code>reproduce/analysis/make/paper.mk</code>
                Makefile must be the final Makefile that is
                included. This workhorse Makefile ends with the rule
                to build
                <code>paper.pdf</code> (final target of the whole
                project). If you look in it, you will notice that this
                Makefile starts with a rule to create
                <code>$(mtexdir)/project.tex</code>
                (<code>mtexdir</code> is just a shorthand name for
                <code>$(BDIR)/tex/macros</code> mentioned before). As
                you see, the only dependency of
                <code>$(mtexdir)/project.tex</code>
                is <code>$(mtexdir)/verify.tex</code> (which is the
                last analysis step: it verifies all the generated
                results).  Therefore,
                <code>$(mtexdir)/project.tex</code> is <em>the
                connection</em> between the processing/analysis steps
                of the project, and the steps to build the final
                PDF.</p>

                <p>During the research, it often happens that you want
                to test a step that is not a prerequisite of any
                higher-level operation. In such cases, you can
                (temporarily) define that processing as a rule in the
                most relevant workhorse-Makefile and set its target as
                a prerequisite of its TeX macro. If your test gives a
                promising result and you want to include it in your
                research, set it as prerequisites to other rules and
                remove it from the list of prerequisites for TeX macro
                file. In fact, this is how a project is designed to
                grow in this framework.</p>

                <h3>File modification dates (meta data)</h3>

                <p>While Git does an excellent job at keeping a
                history of the contents of files, it makes no effort
                in keeping the file meta data, and in particular the
                dates of files. Therefore when you checkout to a
                different branch, files that are re-written by Git
                will have a newer date than the other project
                files. However, file dates are important in the
                current design of Maneage: Make checks the dates of
                the prerequisite files and target files to see if the
                target should be re-built.</p>

                <p>To fix this problem, for Maneage we use a forked
                version of
                <a href="https://github.com/mohammad-akhlaghi/metastore">Metastore</a>. Metastore
                uses a binary database file (which is
                called <code>.file-metadata</code>) to keep the
                modification dates of all the files under version
                control. This file is also under version control, but
                is hidden (because it shouldn't be modified by
                hand). During the project's configuration, Maneage
                installs two Git hooks to run Metastore 1) before
                making a commit, to update its database with the file
                dates in a branch, and 2) after doing a checkout, to
                re-set the file dates back to what they were once the
                checkout is complete.</p>

                <p>In practice, Metastore should work almost fully
                invisibly within your project. The only place you
                might notice its presence is that you'll see
                <code>.file-metadata</code> in the list of
                modified/staged files (commonly after merging your
                branches). Since it's a binary file, Git also won't
                show you the changed contents. In a merge, you can
                simply accept any changes with
                <code>git add -u</code>. But if Git is telling you
                that it has changed without a merge (for example if
                you started a commit, but canceled it in the middle),
                you can just do <code>git checkout
                .file-metadata</code> and set it back to its original
                state.</p>

                <h3>Summary</h3>

                <p>Based on the explanation above, some major design
                points you should have in mind are listed below.</p>

                <ul>
                    <li><p>Define
                        new <code>reproduce/analysis/make/XXXXXX.mk</code>
                        workhorse-Makefile(s) with good and
                        human-friendly name(s)
                        replacing <code>XXXXXX</code>.</p></li>

                    <li><p>Add <code>XXXXXX</code>, as a new line, to
                    the values in <code>makesrc</code> of the
                    top-level
                        <code>Makefile</code>.</p></li>

                    <li><p>Do not use any constant numbers (or
                        important names like filter names) in the
                        workhorse-Makefiles or paper's LaTeX
                        source. Define such constants as
                        logically-grouped, separate
                        configuration-Makefiles in
                        <code>reproduce/analysis/config/XXXXX.conf</code>. Then
                        set this configuration-Makefile as a
                        prerequisite to any rule that uses the
                        variable defined in it.</p></li>

                    <li><p>Through any number of intermediate
                        prerequisites, all processing steps should end
                        in (be a prerequisite
                        of) <code>$(mtexdir)/verify.tex</code>
                        (defined in
                        <code>reproduce/analysis/make/verify.mk</code>). <code>$(mtexdir)/verify.tex</code>
                        is the sole dependency
                        of <code>$(mtexdir)/project.tex</code>, which
                        is the bridge between the processing steps and
                        PDF-building steps of the project.</p></li>
                </ul>

		<p align="right">Next: <a href="about-customize.html">Customization checklist</a>, Previous: <a href="about-make.html">Why Make?</a>, Up: <a href="about.html">About</a> </p>





                <footer role="contentinfo" id="page-footer">
                  <ul>
                    <li><p>Maneage is currently based in the Centro de Estudios de Física del Cosmos de Aragón (CEFCA).</p></li>
                    <li><p>Address: CEFCA, Plaza San Juan 1, Planta 2, Teruel, Spain, 44001.</p></li>
                    <li><p>Contact: with <a href="https://savannah.nongnu.org/support/?func=additem&amp;group=reproduce">this form</a>, or <a href="https://app.element.io/#/room/#maneage-general:matrix.org">#maneage-general:matrix.org</a>, or project PI (<a href="http://akhlaghi.org">Mohammad Akhlaghi</a>).</p></li>
                    <li><p>Copyright &copy; 2020-2022 Maneage volunteers</p></li>
		    <li>This page is distributed under GNU General Public License (<a href="https://www.gnu.org/licenses/gpl-3.0.en.html">GPL</a>).</li>
                  </ul>
                </footer>
            </div>
</body>
</html>