aboutsummaryrefslogtreecommitdiff
path: root/about-architecture.html
blob: 915b45f2c7976c2118c469426f531a0c571dadb1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
<!DOCTYPE html>
<!-- Copyright notes are just below the head and before body -->

    <html lang="en-US">

        <!-- HTML Header -->
        <head>
            <!-- Title of the page. -->
            <title>Maneage -- Managing data lineage</title>

            <!-- Enable UTF-8 encoding to easily use non-ASCII characters -->
            <meta charset="UTF-8">
            <meta http-equiv="Content-type" content="text/html; charset=UTF-8">

            <!-- Put logo beside the address bar -->
            <link rel="shortcut icon" href="./img/favicon.svg" />

            <!-- The viewport meta tag is placed mainly for mobile browsers
                that are pre-configured in different ways (for example setting
                different widths for the page than the actual width of the device,
                or zooming to different values). Without this the CSS media
                solutions might not work properly on all mobile browsers. -->
                <meta name="viewport"
                      content="width=device-width, initial-scale=1">

                <!-- Basic styles -->
                <link rel="stylesheet" href="css/base.css" />
        </head>

        <!--
            Webpage of Maneage: a framework for managing data lineage

            Copyright (C) 2020, Pedram Ashofteh Ardakani <pedramardakani@pm.me>
            Copyright (C) 2020, Mohammad Akhlaghi <mohammad@akhlaghi.org>

            This file is part of Maneage. Maneage is free software: you can
            redistribute it and/or modify it under the terms of the GNU General
            Public License as published by the Free Software Foundation, either
            version 3 of the License, or (at your option) any later version.

            Maneage is distributed in the hope that it will be useful, but
            WITHOUT ANY WARRANTY; without even the implied warranty of
            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
            General Public License for more details. See
            <http://www.gnu.org/licenses/>.  -->

        <!-- Start the main body. -->
        <body>
            <div id="container">
                <header role="banner">
                    <!-- global navigation -->
                    <nav role="navigation" id="nav-hamburger-wrapper">
                        <input type="checkbox" id="nav-hamburger-input"/>
                        <label for="nav-hamburger-input">|||</label>
                        <div id="nav-hamburger-items" class="button">
                            <a href="index.html">Home</a>
                            <a href="about.html">About</a>
                            <a href="http://git.maneage.org/project.git/">Git</a>
                            <a href="tutorial.html">Tutorial</a>
                        </div>
                    </nav>
                </header>
                <div class="banner">
                    <div>
                        <a href="index.html"><img src="img/maneage-logo.svg" alt="Maneage home page" /></a>
                    </div>
                    <div>
                        <h1>Maneage</h1><h2>Managing Data Lineage</h2>
                        <p>Copyright &copy; 2018-2020 Mohammad Akhlaghi <a href="&#109;&#x61;&#x69;&#x6C;&#x74;&#x6F;:&#x6D;&#111;&#104;&#97;&#x6D;&#109;a&#x64;&#64;&#x61;&#107;&#x68;&#x6C;&#x61;&#x67;&#104;&#x69;.&#x6F;&#x72;&#103;">&#x6D;&#111;&#104;&#97;&#x6D;&#109;a&#x64;&#64;&#x61;&#107;&#x68;&#x6C;&#x61;&#x67;&#104;&#x69;.&#x6F;&#x72;&#103;</a><br />
                        Copyright &copy; 2020 Raul Infante-Sainz <a href="m&#x61;&#105;&#108;t&#111;:&#x69;&#x6E;&#x66;&#x61;&#x6E;&#116;&#101;&#115;&#97;&#x69;n&#122;&#64;&#103;&#x6D;&#x61;&#x69;&#x6C;&#x2E;&#x63;&#111;&#x6D;">&#x69;&#x6E;&#x66;&#x61;&#x6E;&#116;&#101;&#115;&#97;&#x69;n&#122;&#64;&#103;&#x6D;&#x61;&#x69;&#x6C;&#x2E;&#x63;&#111;&#x6D;</a><br />
                        <a href="#page-footer">License Conditions</a></p>
                    </div>
                </div>





		<hr />
		<p align="right">Next: <a href="about-customize.html">Customization checklist</a>, Previous: <a href="about-make.html">Why Make?</a>, Up: <a href="about.html">About</a> </p>

                <h1>Project architecture</h1>

                <p>In order to customize Maneage to your research, it is important to first
                understand its architecture so you can navigate your way in the directories
                and understand how to implement your research project within its framework:
                where to add new files and which existing files to modify for what
                purpose. But if this is the first time you are using Maneage, before reading
                this theoretical discussion, please run Maneage once from scratch without
                any changes (described in <code>README.md</code>). You will see how it works (note that
                the configure step builds all necessary software, so it can take long, but
                you can continue reading while it's working).</p>

                <p>The project has two top-level directories: <code>reproduce</code> and
                <code>tex</code>. <code>reproduce</code> hosts all the software building and analysis
                steps. <code>tex</code> contains all the final paper's components to be compiled into
                a PDF using LaTeX.</p>

                <p>The <code>reproduce</code> directory has two sub-directories: <code>software</code> and
                <code>analysis</code>. As the name says, the former contains all the instructions to
                download, build and install (independent of the host operating system) the
                necessary software (these are called by the <code>./project configure</code>
                command). The latter contains instructions on how to use that software to
                do your project's analysis.</p>

                <p>After it finishes, <code>./project configure</code> will create the following symbolic
                links in the project's top source directory: <code>.build</code> which points to the
                top build directory and <code>.local</code> for easy access to the custom built
                software installation directory. With these you can easily access the build
                directory and project-specific software from your top source directory. For
                example if you run <code>.local/bin/ls</code> you will be using the <code>ls</code> of Maneage,
                which is probably different from your system's <code>ls</code> (run them both with
                <code>--version</code> to check).</p>

                <p>Once the project is configured for your system, <code>./project make</code> will do
                the basic preparations and run the project's analysis with the custom
                version of software. The <code>project</code> script is just a wrapper, and with the
                <code>make</code> argument, it will first call <code>top-prepare.mk</code> and then <code>top-make.mk</code>
                (both are in the <code>reproduce/analysis/make</code> directory).</p>

                <p>In terms of organization, <code>top-prepare.mk</code> and <code>top-make.mk</code> have an
                identical design, with only minor differences. So, let's continue Maneage's
                architecture with <code>top-make.mk</code>. Once you understand that, you'll clearly
                understand <code>top-prepare.mk</code> also. These very high-level files are
                relatively short and heavily commented so hopefully the descriptions in
                each comment will be enough to understand the general details. As you read
                this section, please also look at the contents of the mentioned files and
                directories to fully understand what is going on.</p>

                <p>Before starting to look into the top <code>top-make.mk</code>, it is important to
                recall that Make defines dependencies by files. Therefore, the
                input/prerequisite and output of every step/rule must be a file. Also
                recall that Make will use the modification date of the prerequisite(s) and
                target files to see if the target must be re-built or not. Therefore during
                the processing, <em>many</em> intermediate files will be created (see the tips
                section below on a good strategy to deal with large/huge files).</p>

                <p>To keep the source and (intermediate) built files separate, the user <em>must</em>
                define a top-level build directory variable (or <code>$(BDIR)</code>) to host all the
                intermediate files (you defined it during <code>./project configure</code>). This
                directory doesn't need to be version controlled or even synchronized, or
                backed-up in other servers: its contents are all products, and can be
                easily re-created any time. As you define targets for your new rules, it is
                thus important to place them all under sub-directories of <code>$(BDIR)</code>. As
                mentioned above, you always have fast access to this "build"-directory with
                the <code>.build</code> symbolic link. Also, beware to <em>never</em> make any manual change
                in the files of the build-directory, just delete them (so they are
                re-built).</p>

                <p>In this architecture, we have two types of Makefiles that are loaded into
                the top <code>Makefile</code>: <em>configuration-Makefiles</em> (only independent
                variables/configurations) and <em>workhorse-Makefiles</em> (Makefiles that
                actually contain analysis/processing rules).</p>

                <p>The configuration-Makefiles are those that satisfy these two wildcards:
                <code>reproduce/software/config/*.conf</code> (for building the necessary software
                when you run <code>./project configure</code>) and <code>reproduce/analysis/config/*.conf</code>
                (for the high-level analysis, when you run <code>./project make</code>). These
                Makefiles don't actually have any rules, they just have values for various
                free parameters throughout the configuration or analysis. Open a few of
                them to see for yourself. These Makefiles must only contain raw Make
                variables (project configurations). By "raw" we mean that the Make
                variables in these files must not depend on variables in any other
                configuration-Makefile. This is because we don't want to assume any order
                in reading them. It is also very important to <em>not</em> define any rule, or
                other Make construct, in these configuration-Makefiles.</p>

                <p>Following this rule-of-thumb enables you to set these configuration-Makefiles
                as a prerequisite to any target that depends on their variable
                values. Therefore, if you change any of their values, all targets that
                depend on those values will be re-built. This is very convenient as your
                project scales up and gets more complex.</p>

                <p>The workhorse-Makefiles are those satisfying these wildcards:
                <code>reproduce/software/make/*.mk</code> and <code>reproduce/analysis/make/*.mk</code>. They
                contain the details of the processing steps (Makefiles containing
                rules). Therefore, in this phase <em>order is important</em>, because the
                prerequisites of most rules will be the targets of other rules that will be
                defined prior to them (not a fixed name like <code>paper.pdf</code>). The lower-level
                rules must be imported into Make before the higher-level ones.</p>

                <p>All processing steps are assumed to ultimately (usually after many rules)
                end up in some number, image, figure, or table that will be included in the
                paper. The writing of these results into the final report/paper is managed
                through separate LaTeX files that only contain macros (a name given to a
                number/string to be used in the LaTeX source, which will be replaced when
                compiling it to the final PDF). So the last target in a workhorse-Makefile
                is a <code>.tex</code> file (with the same base-name as the Makefile, but in
                <code>$(BDIR)/tex/macros</code>). As a result, if the targets in a workhorse-Makefile
                aren't directly a prerequisite of other workhorse-Makefile targets, they
                can be a prerequisite of that intermediate LaTeX macro file and thus be
                called when necessary. Otherwise, they will be ignored by Make.</p>

                <p>Maneage also has a mode to share the build directory between several
                users of a Unix group (when working on large computer clusters). In this
                scenario, each user can have their own cloned project source, but share the
                large built files between each other. To do this, it is necessary for all
                built files to give full permission to group members while not allowing any
                other users access to the contents. Therefore the <code>./project configure</code> and
                <code>./project make</code> steps must be called with special conditions which are
                managed in the <code>--group</code> option.</p>

                <p>Let's see how this design is implemented. Please open and inspect
                <code>top-make.mk</code> as we go along here. The first step (un-commented line) is
                to import the local configuration (your answers to the questions of
                <code>./project configure</code>). They are defined in the configuration-Makefile
                <code>reproduce/software/config/LOCAL.conf</code> which was also built by <code>./project
                    configure</code> (based on the <code>LOCAL.conf.in</code> template of the same directory).</p>

                <p>The next non-commented set of the top <code>Makefile</code> defines the ultimate
                target of the whole project (<code>paper.pdf</code>). But to avoid mistakes, a sanity
                check is necessary to see if Make is being run with the same group settings
                as the configure script (for example when the project is configured for
                group access using the <code>--group</code> option, but Make isn't). Therefore we
                use a Make conditional to define the <code>all</code> target based on the group
                permissions.</p>

                <p>Having defined the top/ultimate target, our next step is to include all the
                other necessary Makefiles. However, order matters in the importing of
                workhorse-Makefiles and each must also have a TeX macro file with the same
                base name (without a suffix). Therefore, the next step in the top-level
                Makefile is to define the <code>makesrc</code> variable to keep the base names
                (without a <code>.mk</code> suffix) of the workhorse-Makefiles that must be imported,
                in the proper order.</p>

                <p>Finally, we import all the necessary remaining Makefiles: 1) All the
                analysis configuration-Makefiles with a wildcard. 2) The software
                configuration-Makefile that contains their version (just in case it's
                necessary). 3) All workhorse-Makefiles in the proper order using a Make
                <code>foreach</code> loop.</p>

                <p>In short, to keep things modular, readable and manageable, follow these
                recommendations: 1) Set clear-to-understand names for the
                configuration-Makefiles, and workhorse-Makefiles, 2) Only import other
                Makefiles from the top Makefile. These will let you know/remember generally
                which step you are taking before or after another. Projects will scale up
                very fast. Thus if you don't start and continue with a clean and robust
                convention like this, in the end it will become very dirty and hard to
                manage/understand (even for yourself). As a general rule of thumb, break
                your rules into as many logically-similar but independent steps as
                possible.</p>

                <p>The <code>reproduce/analysis/make/paper.mk</code> Makefile must be the final Makefile
                that is included. This workhorse Makefile ends with the rule to build
                <code>paper.pdf</code> (final target of the whole project). If you look in it, you
                will notice that this Makefile starts with a rule to create
                <code>$(mtexdir)/project.tex</code> (<code>mtexdir</code> is just a shorthand name for
                <code>$(BDIR)/tex/macros</code> mentioned before). As you see, the only dependency of
                <code>$(mtexdir)/project.tex</code> is <code>$(mtexdir)/verify.tex</code> (which is the last
                analysis step: it verifies all the generated results).  Therefore,
                <code>$(mtexdir)/project.tex</code> is <em>the connection</em> between the
                processing/analysis steps of the project, and the steps to build the final
                PDF.</p>

                <p>During the research, it often happens that you want to test a step that is
                not a prerequisite of any higher-level operation. In such cases, you can
                (temporarily) define that processing as a rule in the most relevant
                workhorse-Makefile and set its target as a prerequisite of its TeX
                macro. If your test gives a promising result and you want to include it in
                your research, set it as a prerequisite to other rules and remove it from
                the list of prerequisites for the TeX macro file. In fact, this is how a
                project is designed to grow in this framework.</p>

                <h2>File modification dates (meta data)</h2>

                <p>While Git does an excellent job at keeping a history of the contents of
                files, it makes no effort in keeping the file meta data, and in particular
                the dates of files. Therefore when you checkout to a different branch,
                files that are re-written by Git will have a newer date than the other
                project files. However, file dates are important in the current design of
                Maneage: Make checks the dates of the prerequisite files and target files
                to see if the target should be re-built.</p>

                <p>To fix this problem, for Maneage we use a forked version of
                <a href="https://github.com/mohammad-akhlaghi/metastore">Metastore</a>. Metastore uses
                a binary database file (which is called <code>.file-metadata</code>) to keep the
                modification dates of all the files under version control. This file is
                also under version control, but is hidden (because it shouldn't be modified
                by hand). During the project's configuration, Maneage installs two Git hooks
                to run Metastore 1) before making a commit to update its database with the
                file dates in a branch, and 2) after doing a checkout, to reset the file
                dates back to what they were once the checkout is complete.</p>

                <p>In practice, Metastore should work almost fully invisibly within your
                project. The only place you might notice its presence is that you'll see
                <code>.file-metadata</code> in the list of modified/staged files (commonly after
                merging your branches). Since it's a binary file, Git also won't show you
                the changed contents. In a merge, you can simply accept any changes with
                <code>git add -u</code>. But if Git is telling you that it has changed without a merge
                (for example if you started a commit, but canceled it in the middle), you
                can just do <code>git checkout .file-metadata</code> and set it back to its original
                state.</p>

                <h2>Summary</h2>

                <p>Based on the explanation above, some major design points you should have in
                mind are listed below.</p>

                <ul>
                    <li><p>Define new <code>reproduce/analysis/make/XXXXXX.mk</code> workhorse-Makefile(s)
                        with good and human-friendly name(s) replacing <code>XXXXXX</code>.</p></li>
                    <li><p>Add <code>XXXXXX</code>, as a new line, to the values in <code>makesrc</code> of the top-level
                        <code>Makefile</code>.</p></li>
                    <li><p>Do not use any constant numbers (or important names like filter names)
                        in the workhorse-Makefiles or paper's LaTeX source. Define such
                        constants as logically-grouped, separate configuration-Makefiles in
                        <code>reproduce/analysis/config/XXXXX.conf</code>. Then set this
                        configuration-Makefile as a prerequisite to any rule that uses
                        the variable defined in it.</p></li>
                    <li><p>Through any number of intermediate prerequisites, all processing steps
                        should end in (be a prerequisite of) <code>$(mtexdir)/verify.tex</code> (defined in
                        <code>reproduce/analysis/make/verify.mk</code>). <code>$(mtexdir)/verify.tex</code> is the sole
                        dependency of <code>$(mtexdir)/project.tex</code>, which is the bridge between the
                        processing steps and PDF-building steps of the project.</p></li>
                </ul>

		<p align="right">Next: <a href="about-customize.html">Customization checklist</a>, Previous: <a href="about-make.html">Why Make?</a>, Up: <a href="about.html">About</a> </p>





                <footer role="contentinfo" id="page-footer">
                <h2>Copyright information</h2>

                <p>This file is part of Maneage's core: <a href="https://git.maneage.org/project.git">https://git.maneage.org/project.git</a></p>

                <p>Maneage is free software: you can redistribute it and/or modify it under
                the terms of the GNU General Public License as published by the Free
                Software Foundation, either version 3 of the License, or (at your option)
                any later version.</p>

                <p>Maneage is distributed in the hope that it will be useful, but WITHOUT ANY
                WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
                FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
                details.</p>

                <p>You should have received a copy of the GNU General Public License along
                with Maneage.  If not, see <a href="https://www.gnu.org/licenses/">https://www.gnu.org/licenses/</a>.</p>
                <ul>
                    <li><p>Maneage is currently based in the Instituto de Astrofísica de Canarias (IAC).</p></li>
                    <li><p>Address: IAC, Calle Vía Láctea, s/n, E38205 - La Laguna (Tenerife), Spain.</p></li>
                    <!-- The people page will be added later
                    <li><p>People [page will be added later]</p></li>
                    -->
                    <li><p>Contact: with <a href="https://savannah.nongnu.org/support/?func=additem&amp;group=reproduce">this form.</a></p></li>
                    <li><p>Copyright &copy; 2020 Maneage volunteers</p></li>
                    <li><p>All logos are copyrighted by the respective institutions</p></li>
                </ul>
                </footer>
            </div>
        </body>